Remove broken download stats workflow

The "arduino-stats" GitHub Actions workflow was designed to periodically gather download statistics from Arduino CDN and
push results to Datadog.
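For reference, each data point that the Python script below assembled for Datadog had roughly this shape (the version,
OS, architecture, and count values here are illustrative, not real measurements):

    {
        "type": "gauge",
        "name": "arduino.downloads.total",
        "value": "12345",
        "host": "arduino/arduino-ide",
        "tags": [
            "version:2.0.0",
            "os:macOS",
            "arch:64bit",
            "cdn:downloads.arduino.cc",
            "project:arduino-ide",
        ],
    }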

The stats recorded by the identical system in the Arduino CLI repository showed periodic decreases in the total download
count. Since a cumulative download count can never decrease, something is clearly wrong with the system and the recorded
data cannot be trusted. The problem was never investigated.
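A sanity check along these lines (a hypothetical helper, never part of the repository) captures the invariant the
recorded data was violating:

    def counts_are_monotonic(samples):
        # samples: (timestamp, total_download_count) pairs ordered by timestamp
        return all(curr >= prev for (_, prev), (_, curr) in zip(samples, samples[1:]))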

On 2022-03-14, the runs of the "arduino-stats" GitHub Actions workflow began to fail. Because there had not been any
relevant change in the repository between the last successful run and the first failing run, it seems that some external
change caused the breakage.

The workflow also uses actions based on the deprecated Node.js 12 runtime as well as the deprecated set-output workflow
command. These currently only result in warnings printed to the workflow run summary page, but will eventually cause the
complete breakage of the workflow.
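For context, migrating off the deprecated set-output command would have meant replacing the final print() call of the
Python script below with a write to the GITHUB_OUTPUT environment file; a minimal sketch of the equivalent code (never
actually committed):

    import os

    # Instead of: print(f"::set-output name=result::{result_json}")
    with open(os.environ["GITHUB_OUTPUT"], "a") as github_output:
        github_output.write(f"result={result_json}\n")

The Node.js 12 based actions (actions/checkout@v2 and actions/setup-python@v2 in the workflow below) would likewise have
to be bumped to current releases.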

Since the workflow never produced trustworthy data, and the fact that its breakage went uninvestigated indicates the
stats are not of immediate importance, the best course of action is to simply remove the broken infrastructure from the
repository rather than invest time in fixing something that isn't being used anyway.
Author: per1234
Date: 2022-10-30 16:31:23 -07:00
Commit: 9cd03bec46 (parent: c29452a858)
2 changed files with 0 additions and 188 deletions

.github/tools/fetch_athena_stats.py
@@ -1,131 +0,0 @@
import boto3
import semver
import os
import logging
import uuid
import time

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
log = logging.getLogger()
logging.getLogger("boto3").setLevel(logging.CRITICAL)
logging.getLogger("botocore").setLevel(logging.CRITICAL)
logging.getLogger("urllib3").setLevel(logging.CRITICAL)


def execute(client, statement, dest_s3_output_location):
    log.info("execute query: {} dumping in {}".format(statement, dest_s3_output_location))
    result = client.start_query_execution(
        QueryString=statement,
        ClientRequestToken=str(uuid.uuid4()),
        ResultConfiguration={
            "OutputLocation": dest_s3_output_location,
        },
    )
    execution_id = result["QueryExecutionId"]
    log.info("wait for query {} completion".format(execution_id))
    wait_for_query_execution_completion(client, execution_id)
    log.info("operation successful")
    return execution_id


def wait_for_query_execution_completion(client, query_execution_id):
    query_ended = False
    while not query_ended:
        query_execution = client.get_query_execution(QueryExecutionId=query_execution_id)
        state = query_execution["QueryExecution"]["Status"]["State"]
        if state == "SUCCEEDED":
            query_ended = True
        elif state in ["FAILED", "CANCELLED"]:
            raise BaseException(
                "query failed or canceled: {}".format(query_execution["QueryExecution"]["Status"]["StateChangeReason"])
            )
        else:
            time.sleep(1)


def valid(key):
    split = key.split("_")
    if len(split) < 1:
        return False
    try:
        semver.parse(split[0])
    except ValueError:
        return False
    return True


def get_results(client, execution_id):
    results_paginator = client.get_paginator("get_query_results")
    results_iter = results_paginator.paginate(QueryExecutionId=execution_id, PaginationConfig={"PageSize": 1000})
    res = {}
    for results_page in results_iter:
        for row in results_page["ResultSet"]["Rows"][1:]:
            # Loop through the JSON objects
            key = row["Data"][0]["VarCharValue"]
            if valid(key):
                res[key] = row["Data"][1]["VarCharValue"]
    return res


def convert_data(data):
    result = []
    for key, value in data.items():
        # 0.18.0_macOS_64bit.tar.gz
        split_key = key.split("_")
        if len(split_key) != 3:
            continue
        (version, os_version, arch) = split_key
        arch_split = arch.split(".")
        if len(arch_split) < 1:
            continue
        arch = arch_split[0]
        if len(arch) > 10:
            # This can't be an architecture really.
            # It's an ugly solution but works for now so deal with it.
            continue
        repo = os.environ["GITHUB_REPOSITORY"].split("/")[1]
        result.append(
            {
                "type": "gauge",
                "name": "arduino.downloads.total",
                "value": value,
                "host": os.environ["GITHUB_REPOSITORY"],
                "tags": [
                    f"version:{version}",
                    f"os:{os_version}",
                    f"arch:{arch}",
                    "cdn:downloads.arduino.cc",
                    f"project:{repo}",
                ],
            }
        )
    return result


if __name__ == "__main__":
    DEST_S3_OUTPUT = os.environ["AWS_ATHENA_OUTPUT_LOCATION"]
    AWS_ATHENA_SOURCE_TABLE = os.environ["AWS_ATHENA_SOURCE_TABLE"]

    session = boto3.session.Session(region_name="us-east-1")
    athena_client = session.client("athena")
    # Load all partitions before querying downloads
    execute(athena_client, f"MSCK REPAIR TABLE {AWS_ATHENA_SOURCE_TABLE};", DEST_S3_OUTPUT)
    query = f"""SELECT replace(json_extract_scalar(url_decode(url_decode(querystring)),
    '$.data.url'), 'https://downloads.arduino.cc/arduino-ide/arduino-ide_', '')
    AS flavor, count(json_extract(url_decode(url_decode(querystring)),'$')) AS gauge
    FROM {AWS_ATHENA_SOURCE_TABLE}
    WHERE json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
    LIKE 'https://downloads.arduino.cc/arduino-ide/arduino-ide_%'
    AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
    NOT LIKE '%latest%' -- exclude latest redirect
    group by 1 ;"""
    exec_id = execute(athena_client, query, DEST_S3_OUTPUT)

    results = get_results(athena_client, exec_id)
    result_json = convert_data(results)
    print(f"::set-output name=result::{result_json}")


@@ -1,57 +0,0 @@
name: arduino-stats

on:
  schedule:
    # run every day at 07:00 AM, 03:00 PM and 11:00 PM
    - cron: "0 7,15,23 * * *"
  workflow_dispatch:
  repository_dispatch:

jobs:
  push-stats:
    # This workflow is only of value to the arduino/arduino-ide repository and
    # would always fail in forks
    if: github.repository == 'arduino/arduino-ide'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Fetch downloads count from Arduino CDN using AWS Athena
        id: fetch
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.STATS_AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.STATS_AWS_SECRET_ACCESS_KEY }}
          AWS_ATHENA_SOURCE_TABLE: ${{ secrets.STATS_AWS_ATHENA_SOURCE_TABLE }}
          AWS_ATHENA_OUTPUT_LOCATION: ${{ secrets.STATS_AWS_ATHENA_OUTPUT_LOCATION }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          pip install boto3 semver
          python .github/tools/fetch_athena_stats.py

      - name: Send metrics
        uses: masci/datadog@v1
        with:
          api-key: ${{ secrets.DD_API_KEY }}
          # Metrics input expects YAML but JSON will work just right.
          metrics: ${{steps.fetch.outputs.result}}

      - name: Report failure
        if: failure()
        uses: masci/datadog@v1
        with:
          api-key: ${{ secrets.DD_API_KEY }}
          events: |
            - title: "Arduino IDE stats failing"
              text: "Stats collection failed"
              alert_type: "error"
              host: ${{ github.repository }}
              tags:
                - "project:arduino-ide"
                - "cdn:downloads.arduino.cc"
                - "workflow:${{ github.workflow }}"