1#!/usr/bin/env python3.8
2
3import argparse
4import os
5import json
6
7from typing import Dict, Any
8from urllib.request import urlretrieve
9
# Command-line interface of the script. The parser lives at module level so
# that main() can call argparser.parse_args().
argparser = argparse.ArgumentParser(
    prog="download_pypi_packages",
    description="Helper program to download PyPI packages",
)

# -n/--number: how many entries of the package list to fetch (defaults to 100).
argparser.add_argument(
    "-n",
    "--number",
    type=int,
    default=100,
    help="Number of packages to download",
)

# -a/--all: boolean switch, False unless given on the command line.
argparser.add_argument(
    "-a",
    "--all",
    action="store_true",
    help="Download all packages listed in the json file",
)
19
20
def load_json(filename: str) -> Dict[Any, Any]:
    """Read ``data/<filename>.json`` and return the decoded JSON document.

    The file is opened explicitly as UTF-8 rather than with the platform's
    default text encoding, so documents containing non-ASCII characters
    decode the same way on every system.

    Args:
        filename: Base name of the file without the ``.json`` extension. The
            path is resolved relative to the ``data`` directory under the
            current working directory.

    Returns:
        The value decoded from the JSON text.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    path = os.path.join("data", f"{filename}.json")
    with open(path, "r", encoding="utf-8") as f:
        # Parse straight from the file object instead of reading the whole
        # text into an intermediate string first.
        return json.load(f)
25
26
def remove_json(filename: str) -> None:
    """Delete ``data/<filename>.json`` (relative to the working directory).

    Args:
        filename: Base name of the file without the ``.json`` extension.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    os.unlink(os.path.join("data", "{}.json".format(filename)))
30
31
def download_package_json(package_name: str) -> None:
    """Save the PyPI JSON metadata of a package as ``data/<package_name>.json``.

    Args:
        package_name: Project name as used in the ``https://pypi.org/pypi``
            URL; it is also used as the base name of the saved file.
    """
    destination = os.path.join("data", f"{package_name}.json")
    urlretrieve(f"https://pypi.org/pypi/{package_name}/json", destination)
35
36
def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
    """Download the source archive described by a package's JSON metadata.

    The first entry of ``package_json["urls"]`` whose ``"python_version"`` is
    ``"source"`` is selected, and its ``"url"`` is saved under ``data/pypi``
    using the entry's ``"filename"``.

    Args:
        name: Package name; used only in the error message.
        package_json: Decoded package metadata containing a ``"urls"`` list.

    Raises:
        IndexError: If no entry in ``"urls"`` is marked as ``"source"``.
        KeyError: If ``"urls"`` or one of the per-entry keys is missing.
    """
    source_info = next(
        (info for info in package_json["urls"] if info["python_version"] == "source"),
        None,
    )
    if source_info is None:
        # Previously a -1 sentinel index silently selected the *last* listed
        # file (usually a wheel) when no source entry existed. Raise instead so
        # callers that already handle IndexError can report the missing source.
        raise IndexError(f"No source distribution listed for {name}")

    filename = source_info["filename"]
    url = source_info["url"]
    urlretrieve(url, os.path.join("data", "pypi", filename))
46
47
def main() -> None:
    """Download source archives for the packages listed in the top-packages file.

    Reads ``data/top-pypi-packages-365-days.json``, selects either every row
    (``--all``) or the first ``--number`` rows, and for each selected package
    downloads its JSON metadata, then its source archive into ``data/pypi``.
    The per-package metadata file is removed again after each attempt.

    Raises:
        AssertionError: If ``--all`` is not given and ``--number`` is outside
            the range 0..4000.
    """
    args = argparser.parse_args()
    number_packages = args.number
    all_packages = args.all

    top_pypi_packages = load_json("top-pypi-packages-365-days")
    if all_packages:
        top_pypi_packages = top_pypi_packages["rows"]
    elif 0 <= number_packages <= 4000:
        top_pypi_packages = top_pypi_packages["rows"][:number_packages]
    else:
        raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")

    # Create the download directory; an already existing directory is fine.
    os.makedirs(os.path.join("data", "pypi"), exist_ok=True)

    for package in top_pypi_packages:
        package_name = package["project"]

        # flush=True: the message has no trailing newline, so without an
        # explicit flush it would stay buffered until the download finished.
        print(f"Downloading JSON Data for {package_name}... ", end="", flush=True)
        download_package_json(package_name)
        print("Done")

        try:
            # Loading happens inside the try so the metadata file is removed
            # by the finally clause even when it cannot be parsed.
            package_json = load_json(package_name)
            print(
                f"Dowloading and compressing package {package_name} ... ",
                end="",
                flush=True,
            )
            download_package_code(package_name, package_json)
            print("Done")
        except (IndexError, KeyError):
            print(f"Could not locate source for {package_name}")
        finally:
            remove_json(package_name)
84
# Run the downloader only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
87