# parse_html_attribute.py
# Great help from http://bytes.com/topic/python/answers/660853-simpleprograms-challenge
#                 http://stackoverflow.com/questions/1335507/keyboard-input-with-timeout-in-python
#                 http://code.activestate.com/recipes/59892/
#                 https://github.com/collective/collective.xmpp.core/blob/master/collective/xmpp/core/browser/transforms.py
#                 ... thanks
# RJM Programming
# R. Metcalfe
# January, 2015
#
# Examples of usage:
# echo "<html><body><a href='x.html' title='My title'>My link</a></body></html>" | python parse_html_attribute.py title 
#    ... results in ...
# My title
#
# python parse_html_attribute.py x.html
#    ... lists the href attribute values (href is the default attribute) found in the HTML of the file x.html in the current directory ...
#
# python parse_html_attribute.py < x.html
#    ... results in same list as above ...
#
# python parse_html_attribute.py http://localhost:8888/x.html
#    ... results in list of href (which is default) attributes with HTML of URL http://localhost:8888/x.html ...
#
# python parse_html_attribute.py http://localhost:8888/x.html style
#    ... results in list of style attributes with HTML of URL http://localhost:8888/x.html ...
#
# python parse_html_attribute.py 
#    ... prints nothing and exits once the 5 second wait for HTML on standard input times out ...


from HTMLParser import HTMLParser
import sys
import urllib2
import socket
import ssl
from urlparse import urlparse
import signal
import os.path

TIMEOUT = 5 # seconds to wait for HTML on standard input before the SIGALRM alarm fires


def interrupted(signum, frame):
  """Signal handler invoked when the wait for input times out.

  signum and frame are the arguments every signal handler receives; both
  are ignored.  The handler ends the program by raising SystemExit with
  no exit code.
  """
  raise SystemExit()
    

class URLLister(HTMLParser):
    """HTML parser that collects one attribute's values from <a> start tags.

    The attribute to collect is named by the module-level variable
    ``atype``, which is looked up each time an <a> tag is handled.  The
    collected values accumulate, in document order, in ``self.urls``.
    """

    def getUrlContent(self, url, timeout=5):
        """Fetch ``url`` and return the response body, or None on failure.

        url     -- address passed to urllib2.urlopen
        timeout -- seconds to wait on the connection (default 5)

        SSL, URL and socket errors, and a URL that urlopen cannot
        interpret, are all reported by returning None instead of raising.
        """
        try:
            response = urllib2.urlopen(url, timeout=timeout)
            try:
                return response.read()
            finally:
                # Release the connection even when read() fails part-way.
                response.close()
        except (ssl.SSLError, urllib2.URLError, socket.timeout,
                socket.error, ValueError):
            # ValueError is what urlopen raises for an unknown URL type.
            return None

    def reset(self):
        """Reset the parser state and discard any collected values."""
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Dispatch a start tag to ``self.start_<tag>(attrs)`` when defined.

        Tags without a matching ``start_<tag>`` method are ignored.  The
        handler is looked up separately from the call so that an
        AttributeError raised inside a handler is not mistaken for a
        missing handler and silently dropped.
        """
        handler = getattr(self, "start_%s" % tag, None)
        if handler is None:
            return
        try:
            handler(attrs)
        except (ssl.SSLError, urllib2.URLError, socket.timeout, socket.error):
            # Best effort: a handler that hits a network problem must not
            # abort the parse of the rest of the document.
            pass

    def start_a(self, attrs):
        """Record every value of the ``atype`` attribute on an <a> tag.

        attrs -- list of (name, value) pairs for the tag's attributes
        """
        self.urls.extend(value for name, value in attrs if name == atype)

# Single shared parser.  URLLister.start_a reads the module-level `atype`
# defined below to decide which attribute to collect.
parser = URLLister()
url = ""
atype = "href"  # attribute collected when none is named on the command line

# First argument: anything containing a "." is taken to be a URL or a file
# name; anything else names the attribute to collect, and the HTML is then
# read from standard input.
if len(sys.argv) > 1:
    if "." in sys.argv[1]:
        url = sys.argv[1]
    else:
        atype = sys.argv[1]

# Second argument, when present, always names the attribute to collect.
if len(sys.argv) > 2:
    atype = sys.argv[2]

try:
    if url == "":
        # HTML is piped in or redirected on stdin.  Arm an alarm so that a
        # run with nothing on stdin gives up after TIMEOUT seconds; SIGALRM
        # does not exist on every platform, so it is used only if present.
        have_alarm = hasattr(signal, "SIGALRM")
        if have_alarm:
            signal.signal(signal.SIGALRM, interrupted)
            signal.alarm(TIMEOUT)
        try:
            parser.feed(sys.stdin.read())
        except Exception as exc:
            # Best effort: keep whatever was collected before the failure.
            # Exception (not a bare except) lets the SystemExit raised by
            # the alarm handler, and Ctrl-C, end the program.
            sys.stderr.write(
                "parse_html_attribute: error while parsing standard input: %s\n"
                % exc)
        finally:
            if have_alarm:
                signal.alarm(0)
    else:
        if url.lower().startswith(("http://", "https://", "file://")):
            # Get URL contents from the URL given on the command line.
            content = parser.getUrlContent(url)
            if content is None:
                # getUrlContent signals a failed fetch with None, which
                # cannot be fed to the parser; stop with a non-zero status.
                sys.exit("parse_html_attribute: unable to fetch %s" % url)
        elif os.path.isfile(url):
            # Read-only access is all that is needed to parse a local file.
            with open(url, "r") as fo:
                content = fo.read()
        else:
            # Neither a recognised URL nor an existing file: nothing to parse.
            content = ""

        parser.feed(content)

    parser.close()
    for link in parser.urls:
        print(link)
except urllib2.URLError as e:
    print(type(e))
except socket.timeout as e:
    print(type(e))

