[release-2.26] Refactor and expand download_hash.py (#11538)

* download_hash.py: generalized and data-driven

The script is currently limited to one hardcoded URL for kubernetes
related binaries, and a fixed set of architectures.

The solution is three-fold:
1. Use an url template dictionary for each download -> this allow to easily
   add support for new downloads.
2. Source the architectures to search from the existing data
3. Enumerate the existing versions in the data and start searching from
   the last one until no newer version is found (newer in the version
   order sense, irrespective of actual age)

* download_hash.py: support for 'multi-hash' file + runc

runc upstream does not provide one hash file per assets in their
releases, but one file with all the hashes.
To handle this (and/or any arbitrary format from upstreams), add a
dictionary mapping the name of the download to a lambda function which
transform the file provided by upstream into a dictionary of hashes,
keyed by architecture.

* download_hash: argument handling with argparse

Allow the script to be called with a list of components, to only
download new versions checksums for those.
By default, we get new versions checksums for all supported (by the
script) components.

* download_hash: propagate new patch versions to all archs

* download_hash: add support for 'simple hash' components

* download_hash: support 'multi-hash' components

* download_hash: document missing support

* download_hash: use persistent session

This allows to reuse http connection and be more efficient.
From rough measuring it saves around 25-30% of execution time.

* download_hash: cache request for 'multi-hash' files

This avoid re-downloading the same file for different arch and
re-parsing it

* download_hash: document usage

---------

Co-authored-by: Max Gautier <mg@max.gautier.name>
pull/11615/head
k8s-infra-cherrypick-robot 2024-09-16 00:39:14 -07:00 committed by GitHub
parent f9ebd45c74
commit d80686acb0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 159 additions and 30 deletions

View File

@ -6,8 +6,10 @@
import sys
from itertools import count
from itertools import count, groupby
from collections import defaultdict
from functools import cache
import argparse
import requests
from ruamel.yaml import YAML
from packaging.version import Version
@ -25,36 +27,144 @@ def open_checksums_yaml():
return data, yaml
def version_compare(version):
return Version(version.removeprefix("v"))
def download_hash(minors):
architectures = ["arm", "arm64", "amd64", "ppc64le"]
downloads = ["kubelet", "kubectl", "kubeadm"]
downloads = {
"calicoctl_binary": "https://github.com/projectcalico/calico/releases/download/{version}/SHA256SUMS",
"ciliumcli_binary": "https://github.com/cilium/cilium-cli/releases/download/{version}/cilium-{os}-{arch}.tar.gz.sha256sum",
"cni_binary": "https://github.com/containernetworking/plugins/releases/download/{version}/cni-plugins-{os}-{arch}-{version}.tgz.sha256",
"containerd_archive": "https://github.com/containerd/containerd/releases/download/v{version}/containerd-{version}-{os}-{arch}.tar.gz.sha256sum",
"crictl": "https://github.com/kubernetes-sigs/cri-tools/releases/download/{version}/critest-{version}-{os}-{arch}.tar.gz.sha256",
"crio_archive": "https://storage.googleapis.com/cri-o/artifacts/cri-o.{arch}.{version}.tar.gz.sha256sum",
"etcd_binary": "https://github.com/etcd-io/etcd/releases/download/{version}/SHA256SUMS",
"kubeadm": "https://dl.k8s.io/release/{version}/bin/linux/{arch}/kubeadm.sha256",
"kubectl": "https://dl.k8s.io/release/{version}/bin/linux/{arch}/kubectl.sha256",
"kubelet": "https://dl.k8s.io/release/{version}/bin/linux/{arch}/kubelet.sha256",
"nerdctl_archive": "https://github.com/containerd/nerdctl/releases/download/v{version}/SHA256SUMS",
"runc": "https://github.com/opencontainers/runc/releases/download/{version}/runc.sha256sum",
"skopeo_binary": "https://github.com/lework/skopeo-binary/releases/download/{version}/skopeo-{os}-{arch}.sha256",
"yq": "https://github.com/mikefarah/yq/releases/download/{version}/checksums-bsd", # see https://github.com/mikefarah/yq/pull/1691 for why we use this url
}
# TODO: downloads not supported
# youki: no checkusms in releases
# kata: no checksums in releases
# gvisor: sha512 checksums
# crun : PGP signatures
# cri_dockerd: no checksums or signatures
# helm_archive: PGP signatures
# krew_archive: different yaml structure
# calico_crds_archive: different yaml structure
# TODO:
# noarch support -> k8s manifests, helm charts
# different checksum format (needs download role changes)
# different verification methods (gpg, cosign) ( needs download role changes) (or verify the sig in this script and only use the checksum in the playbook)
# perf improvements (async)
def download_hash(only_downloads: [str]) -> None:
# Handle file with multiples hashes, with various formats.
# the lambda is expected to produce a dictionary of hashes indexed by arch name
download_hash_extract = {
"calicoctl_binary": lambda hashes : {
line.split('-')[-1] : line.split()[0]
for line in hashes.strip().split('\n')
if line.count('-') == 2 and line.split('-')[-2] == "linux"
},
"etcd_binary": lambda hashes : {
line.split('-')[-1].removesuffix('.tar.gz') : line.split()[0]
for line in hashes.strip().split('\n')
if line.split('-')[-2] == "linux"
},
"nerdctl_archive": lambda hashes : {
line.split()[1].removesuffix('.tar.gz').split('-')[3] : line.split()[0]
for line in hashes.strip().split('\n')
if [x for x in line.split(' ') if x][1].split('-')[2] == "linux"
},
"runc": lambda hashes : {
parts[1].split('.')[1] : parts[0]
for parts in (line.split()
for line in hashes.split('\n')[3:9])
},
"yq": lambda rhashes_bsd : {
pair[0].split('_')[-1] : pair[1]
# pair = (yq_<os>_<arch>, <hash>)
for pair in ((line.split()[1][1:-1], line.split()[3])
for line in rhashes_bsd.splitlines()
if line.startswith("SHA256"))
if pair[0].startswith("yq")
and pair[0].split('_')[1] == "linux"
and not pair[0].endswith(".tar.gz")
},
}
data, yaml = open_checksums_yaml()
if not minors:
minors = {'.'.join(minor.split('.')[:-1]) for minor in data["kubelet_checksums"]["amd64"].keys()}
s = requests.Session()
for download in downloads:
@cache
def _get_hash_by_arch(download: str, version: str) -> {str: str}:
hash_file = s.get(downloads[download].format(
version = version,
os = "linux",
),
allow_redirects=True)
if hash_file.status_code == 404:
print(f"Unable to find {download} hash file for version {version} at {hash_file.url}")
return None
hash_file.raise_for_status()
return download_hash_extract[download](hash_file.content.decode())
for download, url in (downloads if only_downloads == []
else {k:downloads[k] for k in downloads.keys() & only_downloads}).items():
checksum_name = f"{download}_checksums"
data[checksum_name] = defaultdict(dict, data[checksum_name])
for arch in architectures:
for minor in minors:
if not minor.startswith("v"):
minor = f"v{minor}"
for release in (f"{minor}.{patch}" for patch in count(start=0, step=1)):
if release in data[checksum_name][arch]:
# Propagate new patch versions to all architectures
for arch in data[checksum_name].values():
for arch2 in data[checksum_name].values():
arch.update({
v:("NONE" if arch2[v] == "NONE" else 0)
for v in (set(arch2.keys()) - set(arch.keys()))
if v.split('.')[2] == '0'})
# this is necessary to make the script indempotent,
# by only adding a vX.X.0 version (=minor release) in each arch
# and letting the rest of the script populate the potential
# patch versions
for arch, versions in data[checksum_name].items():
for minor, patches in groupby(versions.copy().keys(), lambda v : '.'.join(v.split('.')[:-1])):
for version in (f"{minor}.{patch}" for patch in
count(start=int(max(patches, key=version_compare).split('.')[-1]),
step=1)):
# Those barbaric generators do the following:
# Group all patches versions by minor number, take the newest and start from that
# to find new versions
if version in versions and versions[version] != 0:
continue
hash_file = requests.get(f"https://dl.k8s.io/release/{release}/bin/linux/{arch}/{download}.sha256", allow_redirects=True)
if hash_file.status_code == 404:
print(f"Unable to find {download} hash file for release {release} (arch: {arch})")
break
hash_file.raise_for_status()
sha256sum = hash_file.content.decode().strip()
if download in download_hash_extract:
hashes = _get_hash_by_arch(download, version)
if hashes == None:
break
sha256sum = hashes.get(arch)
if sha256sum == None:
break
else:
hash_file = s.get(downloads[download].format(
version = version,
os = "linux",
arch = arch
),
allow_redirects=True)
if hash_file.status_code == 404:
print(f"Unable to find {download} hash file for version {version} (arch: {arch}) at {hash_file.url}")
break
hash_file.raise_for_status()
sha256sum = hash_file.content.decode().split()[0]
if len(sha256sum) != 64:
raise Exception(f"Checksum has an unexpected length: {len(sha256sum)} (binary: {download}, arch: {arch}, release: 1.{minor}.{patch})")
data[checksum_name][arch][release] = sha256sum
raise Exception(f"Checksum has an unexpected length: {len(sha256sum)} (binary: {download}, arch: {arch}, release: {version}, checksum: '{sha256sum}')")
data[checksum_name][arch][version] = sha256sum
data[checksum_name] = {arch : {r : releases[r] for r in sorted(releases.keys(),
key=lambda v : Version(v[1:]),
key=version_compare,
reverse=True)}
for arch, releases in data[checksum_name].items()}
@ -62,15 +172,34 @@ def download_hash(minors):
yaml.dump(data, checksums_yml)
print(f"\n\nUpdated {CHECKSUMS_YML}\n")
parser = argparse.ArgumentParser(description=f"Add new patch versions hashes in {CHECKSUMS_YML}",
formatter_class=argparse.RawTextHelpFormatter,
epilog=f"""
This script only lookup new patch versions relative to those already existing
in the data in {CHECKSUMS_YML},
which means it won't add new major or minor versions.
In order to add one of these, edit {CHECKSUMS_YML}
by hand, adding the new versions with a patch number of 0 (or the lowest relevant patch versions)
; then run this script.
def usage():
print(f"USAGE:\n {sys.argv[0]} [k8s_version1] [[k8s_version2]....[k8s_versionN]]")
Note that the script will try to add the versions on all
architecture keys already present for a given download target.
The '0' value for a version hash is treated as a missing hash, so the script will try to download it again.
To notify a non-existing version (yanked, or upstream does not have monotonically increasing versions numbers),
use the special value 'NONE'.
def main(argv=None):
download_hash(sys.argv[1:])
return 0
EXAMPLES:
crictl_checksums:
...
amd64:
+ v1.30.0: 0
v1.29.0: d16a1ffb3938f5a19d5c8f45d363bd091ef89c0bc4d44ad16b933eede32fdcbb
v1.28.0: 8dc78774f7cbeaf787994d386eec663f0a3cf24de1ea4893598096cb39ef2508"""
if __name__ == "__main__":
sys.exit(main())
)
parser.add_argument('binaries', nargs='*', choices=downloads.keys())
args = parser.parse_args()
download_hash(args.binaries)