#!/usr/bin/python
#
# Barf to stdout a list of all links on the web page(s) given as URL
# arguments on the command line. I have decided to write this because
# wget is annoying to use to spider certain classes of files from a
# single web page.
import sys, urlparse

__pychecker__ = 'no-abstract no-stdlib errors'
import urllib
import HTMLParser
__pychecker__ = ''

def warn(str):
	"""Write a one-line diagnostic to stderr, prefixed with the program name.

	str -- the message; it is interpolated with %s. Note that this
	parameter name hides the builtin str inside this function.
	"""
	line = "%s: %s\n" % (sys.argv[0], str)
	sys.stderr.write(line)

# We do not want to do authentication, dude.
class MyUrlOpener(urllib.FancyURLopener):
	"""A FancyURLopener that never supplies credentials.

	get_user_passwd() is overridden to answer (None, None) for every
	host and realm rather than deferring to the base class.
	"""

	def get_user_passwd(self, host, realm, clear_cache = 0):
		"""Return a (user, password) pair of (None, None); all arguments are ignored."""
		# The arguments are deliberately unused; tell pychecker so.
		__pychecker__ = "no-argsused"
		no_credentials = (None, None)
		return no_credentials

def fetchurl(url):
	"""Fetch url and return the response body, or None on failure.

	The page is retrieved through MyUrlOpener. An EnvironmentError
	raised while opening, reading or closing is reported through warn()
	and turned into a None return, so callers only need to test the
	result for truth.
	"""
	try:
		handle = MyUrlOpener().open(url)
		try:
			page = handle.read()
		finally:
			# Always release the connection, even when read() fails.
			handle.close()
	except EnvironmentError as e:
		warn("could not fetch %s: %s" % (url, str(e)))
		return None
	return page

# This grovels through an HTML document and extracts all of the URLs
# that it mentions as <a href="..."> targets. Only <a href> is examined;
# img src's and form post actions are not collected.
# Only hrefs that actually exist as non-null things are kept; null
# ones can result from, e.g., '<a href>', which defines href in the
# dictionary but as 'None' (I believe).
class HTMLUrls(HTMLParser.HTMLParser):
	"""HTML parser that collects the href of every <a> start tag it sees.

	Feed it markup with feed() and close(), then read the collected
	values, in the order seen and with duplicates, from geturls().
	"""

	def __init__(self):
		# The result list is created before the base class initialises.
		self._urls = []
		HTMLParser.HTMLParser.__init__(self)

	def handle_starttag(self, tag, attrs):
		"""Record the href of an <a> tag, if it has a non-empty one.

		tag -- the tag name; compared case-insensitively.
		attrs -- a list of (name, value) pairs for the tag's attributes.
		"""
		if tag.lower() != 'a':
			return
		# A missing href, a valueless one ('<a href>', value None) and
		# an empty one ('href=""') are all skipped; .get() covers the
		# missing case without the deprecated has_key().
		href = dict(attrs).get('href')
		if href:
			self._urls.append(href)

	def geturls(self):
		"""Return the list of href values collected so far."""
		return self._urls

def gethtmlurls(string):
	"""Return the list of URLs that HTMLUrls collects from the text string.

	If the parser raises HTMLParseError, a warning is issued and an
	empty list is returned; anything collected before the error is
	discarded.
	"""
	__pychecker__ = 'no-abstract'
	parser = HTMLUrls()
	try:
		parser.feed(string)
		parser.close()
	except HTMLParser.HTMLParseError:
		warn("cannot parse HTML")
		# If we cannot parse the section as HTML, punt.
		return []
	return parser.geturls()

def process(args):
	"""Print the distinct absolute link URLs found on each page in args.

	args -- an iterable of page URLs. Each page is fetched with
	fetchurl() and scanned with gethtmlurls(); pages that cannot be
	fetched or that yield no links are skipped. Every link is resolved
	against the URL of its page and printed to stdout, one per line, in
	the order first seen.
	"""
	for page_url in args:
		page = fetchurl(page_url)
		if not page:
			continue
		# Duplicates are suppressed per page, not across pages.
		seen = set()
		for link in gethtmlurls(page):
			# Skip same-page fragments and javascript: pseudo-links.
			# URL schemes are case-insensitive, so compare lower-cased.
			if link.startswith('#') or link.lower().startswith("javascript:"):
				continue
			# urljoin does all the heavy lifting of handling
			# relative references for us.
			absolute = urlparse.urljoin(page_url, link)
			if absolute in seen:
				continue
			print(absolute)
			seen.add(absolute)

if __name__ == "__main__":
	# Everything after the program name is a page URL to scan.
	page_urls = sys.argv[1:]
	process(page_urls)
