Tools/buildbot/fetch_data_files.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

"""A helper to download input files needed by assorted encoding tests.

fetch_data_files.py [directory]

Files are downloaded to directory `directory`.  If a directory isn't given,
it defaults to the current directory (.).
"""

# Whitespace-separated list of the URLs to download.  main() breaks this
# string apart with str.split(), so the indentation and the blank lines
# between groups are only there for readability.  Each file is stored
# under the last path component of its URL.
DATA_URLS = """
    http://people.freebsd.org/~perky/i18n/BIG5HKSCS.TXT
    http://people.freebsd.org/~perky/i18n/EUC-CN.TXT
    http://people.freebsd.org/~perky/i18n/EUC-JISX0213.TXT
    http://people.freebsd.org/~perky/i18n/EUC-JP.TXT
    http://people.freebsd.org/~perky/i18n/EUC-KR.TXT
    http://people.freebsd.org/~perky/i18n/SHIFT_JISX0213.TXT

    http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
    http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT
    http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT
    http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT

    http://www.unicode.org/Public/3.2-Update/NormalizationTest-3.2.0.txt

    http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT
    http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/JOHAB.TXT
    http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT
"""

# Adapted from test_support.open_urlresource() in Python 2.5.
def fetch_file_from_url(url, directory):
    """Fetch the file given by `url` and store it in `directory`.

    The file name is taken from the last component of the URL's path.
    If a file with that name already exists in `directory`, nothing is
    fetched.  Progress is reported on standard output.

    The data is first written to a sibling "<name>.part" file and only
    renamed to its final name once the transfer has finished.  Writing
    straight to the final name would leave a truncated file behind when a
    transfer fails, and every later run would then skip it as "already
    exists".  Any exception raised by the download is propagated to the
    caller after the partial file has been removed.
    """
    import os.path
    try:
        # Python 3 locations.
        from urllib.parse import urlparse
        from urllib.request import urlretrieve
    except ImportError:
        # Python 2 locations.
        from urlparse import urlparse
        from urllib import urlretrieve

    filename = urlparse(url)[2].split('/')[-1] # '/': it's a URL!
    target = os.path.join(directory, filename)
    if os.path.exists(target):
        # Parenthesised single-argument print behaves the same whether
        # print is a statement or a function.
        print("\tskipping %r -- already exists" % target)
        return

    print("\tfetching %s ..." % url)
    partial = target + ".part"
    try:
        urlretrieve(url, partial)
        os.rename(partial, target)
    finally:
        # After a successful rename there is nothing left to remove; after
        # a failure this discards whatever was written so far.
        if os.path.exists(partial):
            os.remove(partial)

def main(urls, directory):
    """Download every URL named in `urls` into `directory`.

    `urls` is a single string of whitespace-separated URLs.  Each one is
    passed to fetch_file_from_url() in the order it appears.
    """
    # A parenthesised single expression prints exactly what the bare
    # print statement would.
    print("Downloading data files to %r" % directory)
    pending = urls.split()
    for location in pending:
        fetch_file_from_url(location, directory)

if __name__ == "__main__":
    import sys

    # sys.argv[0] is the script itself, so a length of 1 means no
    # directory was given and a length of 2 means exactly one was.
    argc = len(sys.argv)
    if argc < 1 or argc > 2:
        raise ValueError("no more than one argument allowed")

    # Default to the current directory unless one was named.
    directory = "."
    if argc == 2:
        directory = sys.argv[1]

    main(DATA_URLS, directory)