#!/usr/bin/python # # Barf to stdout a list of all links on the web page(s) given as URL # arguments on the command line. I have decided to write this because # wget is annoying to use to spider certain classes of files from a # single web page. import sys, urlparse __pychecker__ = 'no-abstract no-stdlib errors' import urllib import HTMLParser __pychecker__ = '' def warn(str): sys.stderr.write("%s: %s\n" % (sys.argv[0], str)) # We do not want to do authentication, dude. class MyUrlOpener(urllib.FancyURLopener): def get_user_passwd(self, host, realm, clear_cache = 0): __pychecker__ = "no-argsused" return None, None def fetchurl(url): try: opener = MyUrlOpener() u = opener.open(url) page = u.read() return page except EnvironmentError, e: warn("could not fetch %s: %s" % (url, str(e))) return None # This grovels through a HTML component and extracts all of the URLs # that it mentions as a href's or img src's, or form post actions. # Always assuming that they actually exist as non-null things; null # ones can result from, eg, '', which defines href in the # dictionary but as 'None' (I believe). class HTMLUrls(HTMLParser.HTMLParser): def __init__(self): self._urls = [] HTMLParser.HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): d = dict(attrs) u = None tag = tag.lower() if tag == 'a' and d.has_key('href'): u = d['href'] if u: self._urls.append(u) def geturls(self): return self._urls def gethtmlurls(string): __pychecker__ = 'no-abstract' H = HTMLUrls() urls = [] try: H.feed(string) H.close() return H.geturls() except HTMLParser.HTMLParseError, e: warn("cannot parse HTML") # If we cannot parse the section as HTML, punt. return [] def process(args): for a in args: page = fetchurl(a) if not page: continue urls = gethtmlurls(page) if not urls: continue purls = {} for u in urls: if u[0] == '#' or u.startswith("javascript:"): continue # urljoin does all the heavy lifting of handling # relative references for us. u = urlparse.urljoin(a, u) if u in purls: continue print u purls[u] = None if __name__ == "__main__": process(sys.argv[1:])