Lines Matching full:url
4 # including code to check URL fragments.
13 File URL extension:
65 it based on the URL's suffix. The mimetypes.py module (also in this
100 rooturl -- URL to start checking
132 DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
310 for url in self.bad.keys():
311 self.markerror(url)
331 url = urlparse.urljoin(root, "/robots.txt")
333 self.note(2, "Parsing %s", url)
335 rp.set_url(url)
339 self.note(1, "I/O error parsing %s: %s", url, msg)
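
The robots.txt handling at 331-339 joins "/robots.txt" onto each root, hands the result to a RobotFileParser, and logs I/O failures without aborting. A minimal modern sketch of the same pattern, using Python 3's urllib.robotparser in place of the Python 2 robotparser module (names here are illustrative):

    from urllib import robotparser
    from urllib.parse import urljoin

    def parse_robots(root, agent="webchecker"):
        url = urljoin(root, "/robots.txt")
        rp = robotparser.RobotFileParser()
        rp.set_url(url)
        try:
            rp.read()                     # fetch and parse robots.txt
        except OSError as msg:
            print("I/O error parsing %s: %s" % (url, msg))
        return rp

Afterwards rp.can_fetch(agent, url) answers the per-URL permission checks that isallowed() performs at 483.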
348 for url in urls:
349 self.dopage(url)
379 # to the URL directly, since the URLs in these
380	     # triples are now (URL, fragment) pairs.  The value
382 # origins, and is a URL, not a pair.
383 for url, rawlink, msg in triples:
384 if rawlink != self.format_url(url): s = " (%s)" % rawlink
387 self.format_url(url), s, msg)
399 url, local_fragment = url_pair
409	     # Don't actually mark the URL as bad - it exists, just
413 # Store the page which corresponds to this URL.
414 self.name_table[url] = page
426 origin = url, rawlink
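
dopage() at 399-426 unpacks the (URL, fragment) pair, fetches and caches the page, and turns every link it finds into a new (URL, fragment)/origin record. A compressed sketch of that flow, assuming a Page.getlinkinfos() that yields (link, rawlink, fragment) triples as in the webchecker source (treat the exact shapes as approximate):

    def dopage(self, url_pair):
        url, local_fragment = url_pair
        page = self.getpage(url_pair)          # may come from the cache
        if page:
            self.name_table[url] = page        # kept for later fragment checks
            for link, rawlink, fragment in page.getlinkinfos():
                origin = url, rawlink          # who referenced this link
                self.newlink((link, fragment), origin)
        self.markdone(url_pair)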
434	 def newlink(self, url, origin):
435 if self.done.has_key(url):
436 self.newdonelink(url, origin)
438 self.newtodolink(url, origin)
440	 def newdonelink(self, url, origin):
441 if origin not in self.done[url]:
442 self.done[url].append(origin)
444 # Call self.format_url(), since the URL here
445 # is now a (URL, fragment) pair.
446 self.note(3, " Done link %s", self.format_url(url))
449 if self.bad.has_key(url):
451 triple = url, rawlink, self.bad[url]
454	 def newtodolink(self, url, origin):
455 # Call self.format_url(), since the URL here
456 # is now a (URL, fragment) pair.
457 if self.todo.has_key(url):
458 if origin not in self.todo[url]:
459 self.todo[url].append(origin)
460 self.note(3, " Seen todo link %s", self.format_url(url))
462 self.todo[url] = [origin]
463 self.note(3, " New todo link %s", self.format_url(url))
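
newlink() at 434 routes each discovered link to the done or todo table; both map a (URL, fragment) pair to the list of origins that referenced it, so a link seen twice just grows its origin list. The same bookkeeping in Python 3 (has_key() is the Python 2 spelling of the in operator):

    def newlink(self, url, origin):
        if url in self.done:
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newtodolink(self, url, origin):
        if url in self.todo:
            if origin not in self.todo[url]:   # record each origin once
                self.todo[url].append(origin)
        else:
            self.todo[url] = [origin]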
465	 def format_url(self, url):
466 link, fragment = url
470	 def markdone(self, url):
471 self.done[url] = self.todo[url]
472 del self.todo[url]
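
Because the table keys are (URL, fragment) pairs, format_url() at 465 reassembles a printable link, and markdone() at 470 moves an entry, origin list and all, from todo to done. A sketch of the two helpers:

    def format_url(self, url):
        link, fragment = url
        return link + "#" + fragment if fragment else link

    def markdone(self, url):
        self.done[url] = self.todo[url]        # carry the origin list across
        del self.todo[url]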
475	 def inroots(self, url):
477 if url[:len(root)] == root:
478 return self.isallowed(root, url)
481	 def isallowed(self, root, url):
483 return self.robots[root].can_fetch(AGENTNAME, url)
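
inroots() at 475 is a plain prefix test: a URL is internal only if it starts with one of the configured roots, and even then it must pass that root's robots.txt rules. A minimal sketch, assuming self.roots and AGENTNAME as in the webchecker source:

    def inroots(self, url):
        for root in self.roots:
            if url.startswith(root):           # url[:len(root)] == root
                return self.isallowed(root, url)
        return False

    def isallowed(self, root, url):
        return self.robots[root].can_fetch(AGENTNAME, url)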
486 # Incoming argument name is a (URL, fragment) pair.
488 url, fragment = url_pair
489 if self.name_table.has_key(url):
490 return self.name_table[url]
492 scheme, path = urllib.splittype(url)
494 self.note(1, " Not checking %s URL" % scheme)
496 isint = self.inroots(url)
498 # Ensure that openpage gets the URL pair to
511 if nurl != url:
513 url = nurl
515 return Page(text, url, maxpage=self.maxpage, checker=self)
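
getpage() at 486-515 caches by URL, refuses schemes it cannot usefully crawl, and, when the opener reports a different final URL (nurl) after redirects, stores the page under that URL instead. The scheme test uses the long-gone urllib.splittype(); in Python 3 the same information comes from urllib.parse, roughly:

    from urllib.parse import urlsplit

    for url in ("http://example.com/", "mailto:user@example.com"):
        scheme = urlsplit(url).scheme          # stand-in for urllib.splittype()
        if scheme in ("mailto", "news", "javascript", "telnet"):
            print(" Not checking %s URL" % scheme)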
517 # These next three functions take (URL, fragment) pairs as
521 url, fragment = url_pair
523 f, url = self.openhtml(url_pair)
527 return text, url
530 url, fragment = url_pair
533 url = f.geturl()
535 if not self.checkforhtml(info, url):
538 return f, url
541 url, fragment = url_pair
543 return self.urlopener.open(url)
548 self.show(" HREF ", url, " from", self.todo[url_pair])
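
readhtml()/openhtml()/openpage() at 517-548 form a small chain: open the URL, take the final URL from geturl() so redirects are honored, and bail out early when the response is not HTML. A hedged Python 3 sketch using the stock opener (the checker uses its own FancyURLopener subclass):

    from urllib.request import urlopen

    def openhtml(url):
        f = urlopen(url)
        final = f.geturl()                     # redirects may rewrite the URL
        if f.info().get_content_type() != "text/html":
            f.close()
            return None, final                 # exists, but not parseable
        return f, final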
552	 def checkforhtml(self, info, url):
559 if url[-1:] == "/":
561 ctype, encoding = mimetypes.guess_type(url)
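
checkforhtml() at 552 decides whether a document should be parsed: trailing-slash URLs are taken to be directory indexes, everything else is judged by mimetypes.guess_type() on the URL's suffix, exactly as the module comment at 65 describes (the real method prefers the server's Content-Type from info when one is present). A small sketch of the suffix-based half:

    import mimetypes

    def looks_like_html(url):
        if url.endswith("/"):                  # directory listings render as HTML
            return True
        ctype, encoding = mimetypes.guess_type(url)
        return ctype == "text/html"

    print(looks_like_html("http://example.com/docs/"))     # True
    print(looks_like_html("http://example.com/logo.png"))  # False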
568	 def setgood(self, url):
569 if self.bad.has_key(url):
570 del self.bad[url]
574	 def setbad(self, url, msg):
575 if self.bad.has_key(url) and self.bad[url] == msg:
578 self.bad[url] = msg
580 self.markerror(url)
582	 def markerror(self, url):
584 origins = self.todo[url]
586 origins = self.done[url]
588 triple = url, rawlink, self.bad[url]
591	 def seterror(self, url, triple):
594 # check to make sure the URL hasn't been entered in the
596 # (URL, fragment) pair, but the URL key is not, since it's
598 if triple not in self.errors[url]:
599 self.errors[url].append(triple)
601 self.errors[url] = [triple]
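
setgood()/setbad() at 568/574 keep self.bad consistent (and skip duplicate messages), while markerror() at 582 walks whichever origin list currently holds the URL, todo or done, filing one (url, rawlink, message) triple per referring page via seterror(). A condensed sketch of that bookkeeping:

    def setbad(self, url, msg):
        if url in self.bad and self.bad[url] == msg:
            return                             # already recorded as-is
        self.bad[url] = msg
        self.markerror(url)

    def seterror(self, url, triple):
        # Keyed by the origin page's plain URL, not a (URL, fragment) pair.
        if url in self.errors:
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        else:
            self.errors[url] = [triple]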
631 url = f.geturl()
635 if url[:4] == 'ftp:' or url[:7] == 'file://':
663	 def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
665 self.url = url
673 # the URL to MyHTMLParser().
676 self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
679 self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
680 self.parser = MyHTMLParser(url, verbose=self.verbose,
709 base = urlparse.urljoin(self.url, self.parser.getbase() or "")
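
After parsing, line 709 resolves raw links against the page's effective base: the <base href> if the parser saw one, else the page's own URL. urlparse.urljoin (urllib.parse.urljoin in Python 3) handles both in one call:

    from urllib.parse import urljoin

    page_url = "http://example.com/a/b/page.html"
    base_href = None                           # whatever getbase() returned
    base = urljoin(page_url, base_href or "")  # falls back to the page URL

    print(urljoin(base, "img/logo.png"))       # http://example.com/a/b/img/logo.png
    print(urljoin(base, "/index.html"))        # http://example.com/index.html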
726	 def __init__(self, url, info):
727 self.__url = url
749	 def http_error_401(self, url, fp, errcode, errmsg, headers):
752	 def open_file(self, url):
753 path = urllib.url2pathname(urllib.unquote(url))
756 url = url + '/'
759 return self.open_file(url + "index.html")
766 s = MyStringIO("file:"+url, {'content-type': 'text/html'})
774 return urllib.FancyURLopener.open_file(self, url)
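
MyURLopener.open_file() at 752 maps a file: URL to a filesystem path and, for directories, either retries with index.html appended or serves a synthesized text/html listing through MyStringIO. A modern sketch of the directory fallback (url2pathname and unquote live in urllib.request and urllib.parse in Python 3; the listing synthesis is elided):

    import os
    from urllib.parse import unquote
    from urllib.request import url2pathname

    def open_file_url(url):
        # url is the part after "file:", as urllib's openers pass it.
        path = url2pathname(unquote(url))
        if os.path.isdir(path):
            if url[-1:] != "/":
                url = url + "/"
            if os.path.exists(os.path.join(path, "index.html")):
                return open_file_url(url + "index.html")
            # The real opener builds an HTML directory listing here.
        return open(path, "rb")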
779	 def __init__(self, url, verbose=VERBOSE, checker=None):
785 self.url = url
799 value, self.url)
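
MyHTMLParser at 779 is the piece that actually harvests links while the page is parsed; its errors at 799 are reported together with self.url for context. A minimal Python 3 analogue with html.parser, collecting href/src attributes (a sketch only; the original also tracks <base> and anchor names for the fragment checks):

    from html.parser import HTMLParser

    class LinkCollector(HTMLParser):
        def __init__(self, url):
            super().__init__()
            self.url = url                     # kept for error messages
            self.links = []

        def handle_starttag(self, tag, attrs):
            for name, value in attrs:
                if name in ("href", "src") and value:
                    self.links.append(value)

    p = LinkCollector("http://example.com/")
    p.feed('<a href="/about">About</a><img src="logo.png">')
    print(p.links)                             # ['/about', 'logo.png']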