1#! /usr/bin/env python
2
3"""A variant on webchecker that creates a mirror copy of a remote site."""
4
5__version__ = "$Revision$"
6
7import os
8import sys
9import urllib
10import getopt
11
12import webchecker
13
# If the revision keyword has been expanded by the version control
# system (three whitespace-separated fields, e.g. "$Revision: 1.5 $"),
# keep only the middle field, i.e. the bare revision number.
if __version__.startswith('$'):
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
19
def main():
    """Command-line entry point: mirror each root URL named in sys.argv.

    Options:
        -q  set the verbosity to 0
        -v  raise the verbosity by one (may be given several times)

    Returns 2 after printing a usage message when the options cannot be
    parsed; otherwise returns None once the run has finished.
    """
    verbosity = webchecker.VERBOSE
    try:
        options, roots = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % (sys.argv[0],))
        return 2

    # Options are applied in command-line order, so a later -q resets
    # whatever earlier -v flags had accumulated.
    for flag, _ in options:
        if flag == "-q":
            verbosity = 0
        elif flag == "-v":
            verbosity += 1

    sucker = Sucker()
    sucker.setflags(verbose=verbosity)
    # Identify ourselves to servers as websucker/<version>.
    sucker.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]

    for root in roots:
        print("Adding root %s" % (root,))
        sucker.addroot(root)

    print("Run...")
    sucker.run()
43
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it reads.

    Pages are stored in a directory tree rooted at the lower-cased host
    name and mirroring the URL path (see savefilename()).  A page whose
    local copy already exists is read from disk instead of being
    fetched again.
    """

    # Flags read by webchecker.Checker.
    # NOTE(review): presumably checkext=0 turns off checking of external
    # links -- confirm against webchecker.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for the page named by url_pair.

        url_pair is a sequence whose first item is the URL; the rest is
        ignored here.  If a local copy exists it is used, otherwise the
        page is opened with self.openpage() and its contents are saved
        with self.savefile().

        text is the page content, or None when the page could not be
        opened or self.checkforhtml() says it is not HTML.  url is the
        URL reported by the opened page (f.geturl()) when the page was
        fetched, else the URL that was passed in.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No readable local copy: fetch the page and save it.
            f = self.openpage(url_pair)
            if f:
                # try/finally so the connection is released even when
                # reading fails half way through.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # The page reports a different URL than the one
                        # requested; file it under that name instead.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    f.close()
                # Everything fetched is saved; only HTML is returned.
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy found.  There are no headers to consult, so
            # checkforhtml() gets an empty mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating parent directories as needed.

        This is best effort: any I/O or OS error is reported through
        self.message() rather than raised, so one unwritable file does
        not abort the whole run.
        """
        dir = os.path.dirname(path)
        try:
            # makedirs() raises os.error (OSError) while open()/write()
            # raise IOError; EnvironmentError covers both, on Python 2
            # as well as Python 3.
            makedirs(dir)
            f = open(path, "wb")
            try:
                f.write(text)
            finally:
                # Never leak the handle, even if the write fails.
                f.close()
        except EnvironmentError as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Return the relative local path under which url is stored.

        The result is <host>/<path>, with the host lower-cased and any
        user info and port dropped.  A URL path that is empty or names
        a directory (ends in "/", "." or "..") maps to index.html in
        that directory.

        "." and ".." components are resolved here, and ".." is never
        allowed to climb above the host directory: URLs come from
        remote pages and must not be able to place files outside the
        mirror tree.  Empty components ("a//b") are dropped.
        """
        rest = urllib.splittype(url)[1]
        host, path = urllib.splithost(rest)
        host = urllib.splituser(host)[1]
        host = urllib.splitnport(host)[0].lower()
        names = path.lstrip("/").split("/")
        # Decide this before normalising, while the trailing component
        # is still visible.
        isdir = names[-1] in ("", ".", "..")
        parts = []
        for name in names:
            if name == "..":
                # Go up one level, but stop at the host directory.
                if parts:
                    del parts[-1]
            elif name and name != ".":
                parts.append(name)
        if isdir:
            parts.append("index.html")
        return os.path.join(host, os.sep.join(parts))
102
def makedirs(dir):
    """Create directory dir, including any missing parent directories.

    Does nothing when dir is empty or already a directory.

    If dir exists but is not a directory (a page was saved under that
    name before it turned out to have children), it is converted on a
    best-effort basis: the file is moved to dir/index.html and dir
    becomes a directory.  Should the conversion fail before the
    directory exists, the file is moved back to its original name
    instead of being left behind as dir + ".bak".

    Raises os.error (OSError) if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            backup = dir + ".bak"
            try:
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error:
                # Still best effort, but undo the first rename when the
                # directory never came into being, so the saved page is
                # not stranded under its ".bak" name.
                if os.path.isfile(backup) and not os.path.exists(dir):
                    try:
                        os.rename(backup, dir)
                    except os.error:
                        pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Nothing left to split off, yet the path does not exist.
        print("Huh?  Don't know how to make dir %s" % (dir,))
        return
    # Parents first, then the directory itself.
    makedirs(head)
    os.mkdir(dir, 0o777)
121
# Run as a script: main()'s return value becomes the exit status, with
# None (normal completion) mapped to 0.
if __name__ == '__main__':
    raise SystemExit(main() or 0)
124