# parse_html_attribute.py
# Great help from http://bytes.com/topic/python/answers/660853-simpleprograms-challenge
# http://stackoverflow.com/questions/1335507/keyboard-input-with-timeout-in-python
# http://code.activestate.com/recipes/59892/
# https://github.com/collective/collective.xmpp.core/blob/master/collective/xmpp/core/browser/transforms.py
# ... thanks
# RJM Programming
# R. Metcalfe
# January, 2015
#
# Lists the values of one attribute (href by default) of every <a> tag
# found in some HTML.
#
# Examples of usage:
# echo '<a href="#" title="My title">My link</a>' | python parse_html_attribute.py title
# ... results in ...
# My title
#
# python parse_html_attribute.py x.html
# ... results in list of href (which is default) attributes with HTML of
#     x.html file in current directory ...
#
# python parse_html_attribute.py < x.html
# ... results in same list as above ...
#
# python parse_html_attribute.py http://localhost:8888/x.html
# ... results in list of href (which is default) attributes with HTML of
#     URL http://localhost:8888/x.html ...
#
# python parse_html_attribute.py http://localhost:8888/x.html style
# ... results in list of style attributes with HTML of
#     URL http://localhost:8888/x.html ...
#
# python parse_html_attribute.py
# ... results in nothing (after 5 seconds ... timeout) ...

import os.path
import signal
import socket
import ssl
import sys

try:  # Python 2 module names
    from HTMLParser import HTMLParser
    from urlparse import urlparse
    import urllib2
except ImportError:  # Python 3 equivalents
    from html.parser import HTMLParser
    from urllib.parse import urlparse
    import urllib.request as urllib2

TIMEOUT = 5  # number of seconds you want for timeout

# Prefixes that mark a command-line source as something to fetch with urlopen.
URL_SCHEMES = ("http://", "https://", "file://")


def interrupted(signum, frame):
    "called when read times out"
    sys.exit()


def to_text(data):
    """Return *data* as text the HTML parser can be fed.

    ``None`` (a failed fetch) becomes the empty string.  On Python 3 the
    raw bytes returned by ``urlopen().read()`` or a binary file read are
    decoded as UTF-8, replacing undecodable bytes; on Python 2 ``bytes``
    is ``str`` and the data is passed through untouched.
    """
    if data is None:
        return ""
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode("utf-8", "replace")
    return data


def fetch_url(url, timeout=TIMEOUT):
    """Return the raw body of *url*, or ``None`` if it cannot be retrieved."""
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except (ssl.SSLError, urllib2.URLError, socket.timeout, socket.error,
            ValueError):
        # Best effort: an unreachable or malformed URL yields no content.
        return None


class URLLister(HTMLParser):
    """HTML parser that collects one attribute's values from ``<a>`` tags.

    Collected values accumulate, in document order, in ``self.urls``.
    Start tags are dispatched to a ``start_<tag>`` method when one exists,
    so a subclass can handle further tags by defining such methods.
    """

    def __init__(self, attribute="href"):
        # Name of the attribute whose values are collected.
        self.attribute = attribute
        # Explicit base call: on Python 2 HTMLParser is an old-style class.
        HTMLParser.__init__(self)

    def getUrlContent(self, url):
        """Return the raw body of *url*, or ``None`` if the fetch fails."""
        return fetch_url(url)

    def reset(self):
        """Reset parser state and forget any values collected so far."""
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Call ``start_<tag>(attrs)`` if such a handler is defined."""
        # Look the handler up with a default rather than catching
        # AttributeError, so errors raised inside a handler are not hidden.
        handler = getattr(self, "start_%s" % tag, None)
        if handler is not None:
            handler(attrs)

    def start_a(self, attrs):
        """Record the value of every matching attribute of an ``<a>`` tag."""
        self.urls.extend(
            value for name, value in attrs if name == self.attribute
        )


def parse_args(argv):
    """Split command-line *argv* into ``(source, attribute)``.

    The first argument is taken as the source (URL or file name) when it
    contains a ``.`` or a ``://``; otherwise it is the attribute name and
    the HTML is expected on stdin.  A second argument, when present, always
    names the attribute.  Defaults are ``""`` (stdin) and ``"href"``.
    """
    source, attribute = "", "href"
    if len(argv) > 1:
        first = argv[1]
        if "." in first or "://" in first:
            source = first
        else:
            attribute = first
        if len(argv) > 2:
            attribute = argv[2]
    return source, attribute


def read_source(source):
    """Return the HTML text behind *source* (a URL or an existing file).

    Returns the empty string when a URL cannot be fetched or when *source*
    is neither a recognised URL nor an existing file.
    """
    if source.lower().startswith(URL_SCHEMES):
        return to_text(fetch_url(source))
    if os.path.isfile(source):
        # Read-only and binary: no write permission is needed, and decoding
        # is handled uniformly by to_text().
        with open(source, "rb") as handle:
            return to_text(handle.read())
    return ""


def read_stdin(timeout=TIMEOUT):
    """Return everything on stdin, exiting if nothing arrives in *timeout* s.

    The timeout relies on SIGALRM; where the platform does not provide it
    the read simply blocks until end of input.
    """
    use_alarm = hasattr(signal, "SIGALRM")
    if use_alarm:
        signal.signal(signal.SIGALRM, interrupted)
        signal.alarm(timeout)
    try:
        return sys.stdin.read()
    except (IOError, OSError, ValueError):
        # Unreadable or undecodable input is treated as no input.
        return ""
    finally:
        if use_alarm:
            signal.alarm(0)


def extract_attribute_values(html, attribute="href"):
    """Return the *attribute* values of all ``<a>`` tags in *html*, in order."""
    lister = URLLister(attribute)
    try:
        lister.feed(html)
        lister.close()
    except Exception:
        # Deliberate best effort: on malformed markup, keep whatever was
        # collected before the parser gave up.
        pass
    return lister.urls


def main(argv=None):
    """Print one attribute value per line for the source named in *argv*."""
    source, attribute = parse_args(sys.argv if argv is None else argv)
    html = read_source(source) if source else read_stdin()
    for value in extract_attribute_values(html, attribute):
        print(value)


if __name__ == "__main__":
    main()