Lines Matching full:url
4 # including code to check URL fragments.
13 File URL extension:
65 it based on the URL's suffix. The mimetypes.py module (also in this
100 rooturl -- URL to start checking
132 DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
310 for url in self.bad.keys():
311 self.markerror(url)
331 url = urlparse.urljoin(root, "/robots.txt")
333 self.note(2, "Parsing %s", url)
335 rp.set_url(url)
339 self.note(1, "I/O error parsing %s: %s", url, msg)
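
The robots.txt handling at 331-339 joins "/robots.txt" onto each root, hands the result to a RobotFileParser, and logs I/O failures without aborting. A minimal modern sketch of the same pattern, using Python 3's urllib.robotparser in place of the Python 2 robotparser module (names here are illustrative):

    from urllib import robotparser
    from urllib.parse import urljoin

    def parse_robots(root, agent="webchecker"):
        url = urljoin(root, "/robots.txt")
        rp = robotparser.RobotFileParser()
        rp.set_url(url)
        try:
            rp.read()                     # fetch and parse robots.txt
        except OSError as msg:
            print("I/O error parsing %s: %s" % (url, msg))
        return rp

Afterwards rp.can_fetch(agent, url) answers the per-URL permission checks that isallowed() performs at 483.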
348 for url in urls:
349 self.dopage(url)
379 # to the URL directly, since the URLs in these
380	     # triples are now (URL, fragment) pairs.  The value
382 # origins, and is a URL, not a pair.
383 for url, rawlink, msg in triples:
384 if rawlink != self.format_url(url): s = " (%s)" % rawlink
387 self.format_url(url), s, msg)
399 url, local_fragment = url_pair
409	     # Don't actually mark the URL as bad - it exists, just
413 # Store the page which corresponds to this URL.
414 self.name_table[url] = page
426 origin = url, rawlink
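
dopage() at 399-426 unpacks the (URL, fragment) pair, fetches and caches the page, and turns every link it finds into a new (URL, fragment)/origin record. A compressed sketch of that flow, assuming a Page.getlinkinfos() that yields (link, rawlink, fragment) triples as in the webchecker source (treat the exact shapes as approximate):

    def dopage(self, url_pair):
        url, local_fragment = url_pair
        page = self.getpage(url_pair)          # may come from the cache
        if page:
            self.name_table[url] = page        # kept for later fragment checks
            for link, rawlink, fragment in page.getlinkinfos():
                origin = url, rawlink          # who referenced this link
                self.newlink((link, fragment), origin)
        self.markdone(url_pair)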
434	 def newlink(self, url, origin):
435 if self.done.has_key(url):
436 self.newdonelink(url, origin)
438 self.newtodolink(url, origin)
440	 def newdonelink(self, url, origin):
441 if origin not in self.done[url]:
442 self.done[url].append(origin)
444 # Call self.format_url(), since the URL here
445 # is now a (URL, fragment) pair.
446 self.note(3, " Done link %s", self.format_url(url))
449 if self.bad.has_key(url):
451 triple = url, rawlink, self.bad[url]
454	 def newtodolink(self, url, origin):
455 # Call self.format_url(), since the URL here
456 # is now a (URL, fragment) pair.
457 if self.todo.has_key(url):
458 if origin not in self.todo[url]:
459 self.todo[url].append(origin)
460 self.note(3, " Seen todo link %s", self.format_url(url))
462 self.todo[url] = [origin]
463 self.note(3, " New todo link %s", self.format_url(url))
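
newlink() at 434 routes each discovered link to the done or todo table; both map a (URL, fragment) pair to the list of origins that referenced it, so a link seen twice just grows its origin list. The same bookkeeping in Python 3 (has_key() is the Python 2 spelling of the in operator):

    def newlink(self, url, origin):
        if url in self.done:
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newtodolink(self, url, origin):
        if url in self.todo:
            if origin not in self.todo[url]:   # record each origin once
                self.todo[url].append(origin)
        else:
            self.todo[url] = [origin]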
465	 def format_url(self, url):
466 link, fragment = url
470	 def markdone(self, url):
471 self.done[url] = self.todo[url]
472 del self.todo[url]
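
Because the table keys are (URL, fragment) pairs, format_url() at 465 reassembles a printable link, and markdone() at 470 moves an entry, origin list and all, from todo to done. A sketch of the two helpers:

    def format_url(self, url):
        link, fragment = url
        return link + "#" + fragment if fragment else link

    def markdone(self, url):
        self.done[url] = self.todo[url]        # carry the origin list across
        del self.todo[url]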
475	 def inroots(self, url):
477 if url[:len(root)] == root:
478 return self.isallowed(root, url)
481	 def isallowed(self, root, url):
483 return self.robots[root].can_fetch(AGENTNAME, url)
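
inroots() at 475 is a plain prefix test: a URL is internal only if it starts with one of the configured roots, and even then it must pass that root's robots.txt rules. A minimal sketch, assuming self.roots and AGENTNAME as in the webchecker source:

    def inroots(self, url):
        for root in self.roots:
            if url.startswith(root):           # url[:len(root)] == root
                return self.isallowed(root, url)
        return False

    def isallowed(self, root, url):
        return self.robots[root].can_fetch(AGENTNAME, url)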
486 # Incoming argument name is a (URL, fragment) pair.
488 url, fragment = url_pair
489 if self.name_table.has_key(url):
490 return self.name_table[url]
492 scheme, path = urllib.splittype(url)
494 self.note(1, " Not checking %s URL" % scheme)
496 isint = self.inroots(url)
498 # Ensure that openpage gets the URL pair to
511 if nurl != url:
513 url = nurl
515 return Page(text, url, maxpage=self.maxpage, checker=self)
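
getpage() at 486-515 caches by URL, refuses schemes it cannot usefully crawl, and, when the opener reports a different final URL (nurl) after redirects, stores the page under that URL instead. The scheme test uses the long-gone urllib.splittype(); in Python 3 the same information comes from urllib.parse, roughly:

    from urllib.parse import urlsplit

    for url in ("http://example.com/", "mailto:user@example.com"):
        scheme = urlsplit(url).scheme          # stand-in for urllib.splittype()
        if scheme in ("mailto", "news", "javascript", "telnet"):
            print(" Not checking %s URL" % scheme)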
517 # These next three functions take (URL, fragment) pairs as
521 url, fragment = url_pair
523 f, url = self.openhtml(url_pair)
527 return text, url
530 url, fragment = url_pair
533 url = f.geturl()
535 if not self.checkforhtml(info, url):
538 return f, url
541 url, fragment = url_pair
543 return self.urlopener.open(url)
548 self.show(" HREF ", url, " from", self.todo[url_pair])
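
readhtml()/openhtml()/openpage() at 517-548 form a small chain: open the URL, take the final URL from geturl() so redirects are honored, and bail out early when the response is not HTML. A hedged Python 3 sketch using the stock opener (the checker uses its own FancyURLopener subclass):

    from urllib.request import urlopen

    def openhtml(url):
        f = urlopen(url)
        final = f.geturl()                     # redirects may rewrite the URL
        if f.info().get_content_type() != "text/html":
            f.close()
            return None, final                 # exists, but not parseable
        return f, final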
552	 def checkforhtml(self, info, url):
559 if url[-1:] == "/":
561 ctype, encoding = mimetypes.guess_type(url)
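
checkforhtml() at 552 decides whether a document should be parsed: trailing-slash URLs are taken to be directory indexes, everything else is judged by mimetypes.guess_type() on the URL's suffix, exactly as the module comment at 65 describes (the real method prefers the server's Content-Type from info when one is present). A small sketch of the suffix-based half:

    import mimetypes

    def looks_like_html(url):
        if url.endswith("/"):                  # directory listings render as HTML
            return True
        ctype, encoding = mimetypes.guess_type(url)
        return ctype == "text/html"

    print(looks_like_html("http://example.com/docs/"))     # True
    print(looks_like_html("http://example.com/logo.png"))  # False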
568	 def setgood(self, url):
569 if self.bad.has_key(url):
570 del self.bad[url]
574	 def setbad(self, url, msg):
575 if self.bad.has_key(url) and self.bad[url] == msg:
578 self.bad[url] = msg
580 self.markerror(url)
582	 def markerror(self, url):
584 origins = self.todo[url]
586 origins = self.done[url]
588 triple = url, rawlink, self.bad[url]
591	 def seterror(self, url, triple):
594 # check to make sure the URL hasn't been entered in the
596 # (URL, fragment) pair, but the URL key is not, since it's
598 if triple not in self.errors[url]:
599 self.errors[url].append(triple)
601 self.errors[url] = [triple]
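
setgood()/setbad() at 568/574 keep self.bad consistent (and skip duplicate messages), while markerror() at 582 walks whichever origin list currently holds the URL, todo or done, filing one (url, rawlink, message) triple per referring page via seterror(). A condensed sketch of that bookkeeping:

    def setbad(self, url, msg):
        if url in self.bad and self.bad[url] == msg:
            return                             # already recorded as-is
        self.bad[url] = msg
        self.markerror(url)

    def seterror(self, url, triple):
        # Keyed by the origin page's plain URL, not a (URL, fragment) pair.
        if url in self.errors:
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        else:
            self.errors[url] = [triple]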
631 url = f.geturl()
635 if url[:4] == 'ftp:' or url[:7] == 'file://':
663	 def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
665 self.url = url
673 # the URL to MyHTMLParser().
676 self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
679 self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
680 self.parser = MyHTMLParser(url, verbose=self.verbose,
709 base = urlparse.urljoin(self.url, self.parser.getbase() or "")
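
After parsing, line 709 resolves raw links against the page's effective base: the <base href> if the parser saw one, else the page's own URL. urlparse.urljoin (urllib.parse.urljoin in Python 3) handles both in one call:

    from urllib.parse import urljoin

    page_url = "http://example.com/a/b/page.html"
    base_href = None                           # whatever getbase() returned
    base = urljoin(page_url, base_href or "")  # falls back to the page URL

    print(urljoin(base, "img/logo.png"))       # http://example.com/a/b/img/logo.png
    print(urljoin(base, "/index.html"))        # http://example.com/index.html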
726	 def __init__(self, url, info):
727 self.__url = url
749	 def http_error_401(self, url, fp, errcode, errmsg, headers):
752	 def open_file(self, url):
753 path = urllib.url2pathname(urllib.unquote(url))
756 url = url + '/'
759 return self.open_file(url + "index.html")
766 s = MyStringIO("file:"+url, {'content-type': 'text/html'})
774 return urllib.FancyURLopener.open_file(self, url)
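
MyURLopener.open_file() at 752 maps a file: URL to a filesystem path and, for directories, either retries with index.html appended or serves a synthesized text/html listing through MyStringIO. A modern sketch of the directory fallback (url2pathname and unquote live in urllib.request and urllib.parse in Python 3; the listing synthesis is elided):

    import os
    from urllib.parse import unquote
    from urllib.request import url2pathname

    def open_file_url(url):
        # url is the part after "file:", as urllib's openers pass it.
        path = url2pathname(unquote(url))
        if os.path.isdir(path):
            if url[-1:] != "/":
                url = url + "/"
            if os.path.exists(os.path.join(path, "index.html")):
                return open_file_url(url + "index.html")
            # The real opener builds an HTML directory listing here.
        return open(path, "rb")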
779	 def __init__(self, url, verbose=VERBOSE, checker=None):
785 self.url = url
799 value, self.url)
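
MyHTMLParser at 779 is the piece that actually harvests links while the page is parsed; its errors at 799 are reported together with self.url for context. A minimal Python 3 analogue with html.parser, collecting href/src attributes (a sketch only; the original also tracks <base> and anchor names for the fragment checks):

    from html.parser import HTMLParser

    class LinkCollector(HTMLParser):
        def __init__(self, url):
            super().__init__()
            self.url = url                     # kept for error messages
            self.links = []

        def handle_starttag(self, tag, attrs):
            for name, value in attrs:
                if name in ("href", "src") and value:
                    self.links.append(value)

    p = LinkCollector("http://example.com/")
    p.feed('<a href="/about">About</a><img src="logo.png">')
    print(p.links)                             # ['/about', 'logo.png']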