2010-03-15
What I wanted was a simple way to generate a very basic sitemap file that would conform to the Sitemap protocol. Fortunately, I couldn't find anything to do it for me, so I had to write a Python application myself. But wait! Why stop there?
Since the application is basically a site crawler, I might as well add some feature bloat by checking for link errors and gathering some fairly useless information about the crawled site. To make this possible I utilized the OptionParser class from the optparse library so that I could pass in various command-line arguments to affect the way the app outputs data and to choose which URL to parse.
Without further ado... enter the Python:
On my computer this file is named "walk.py", and to make a sitemap for my website I run
Well there you have it. Copy, edit, learn, or ignore.
Now quit reading, and go crawl with Python.
Since the application is basically a site crawler, I might as well add some feature bloat by checking for link errors and gathering some fairly useless information about the crawled site. To make this possible I utilized the OptionParser class from the optparse library so that I could pass in various command-line arguments to affect the way the app outputs data and to choose which URL to parse.
Without further ado... enter the Python:
#!/usr/bin/env python
"""walk.py -- crawl a website, check for broken links, and optionally
generate a sitemap.xml conforming to the Sitemap protocol.

Usage: walk.py -u URL [-o DIR] [-s] [-v]

Ported to Python 3 (print function, html.parser / urllib.request module
moves, dict.has_key removal) with several bug fixes noted inline.
"""
import os
import re
import sys
from html.parser import HTMLParser
from optparse import OptionParser
from urllib.error import HTTPError
from urllib.request import urlopen


class Parser(HTMLParser):
    """Collect unique href values from <a> tags and the page's <base> href."""

    def __init__(self, verbose=False):
        HTMLParser.__init__(self)
        self.clear()
        self.verbose = verbose

    def handle_starttag(self, tag, attrs):
        # We only care about <a> (links to follow) and <base> (the root
        # against which relative links must be resolved).
        if tag == "a":
            href = self.get_value_from_tuple_list("href", attrs)
            if href is not None:
                if self.verbose:
                    print("found href: " + href)
                if href not in self.hrefs:
                    self.hrefs.append(href)
        elif tag == "base":
            href = self.get_value_from_tuple_list("href", attrs)
            if href is not None:
                # BUGFIX: the original printed the href before the None
                # check, raising TypeError on <base> tags without an href.
                if self.verbose:
                    print("found base href: " + href)
                self.base_href = href

    def get_hrefs(self):
        """Return the unique hrefs seen since the last clear()."""
        return self.hrefs

    def get_base_href(self):
        """Return the page's <base> href, or "" if none was seen."""
        return self.base_href

    def get_value_from_tuple_list(self, target_key, tuple_list):
        """Return the value paired with target_key in a list of
        (key, value) tuples, or None when the key is absent."""
        for key, value in tuple_list:
            if key == target_key:
                return value
        return None

    def clear(self):
        """Reset collected state so the parser instance can be reused."""
        self.hrefs = []
        self.base_href = ""


class JayWalker:
    """Crawl a site from options.start_url, recording valid links, broken
    links, and the browsable (HTML) pages found along the way."""

    # Extensions we are willing to fetch and parse as web pages.
    webpage_extensions = ["htm", "html", "php", "asp", "jsp", "py", ""]

    def __init__(self, options):
        """options: parsed optparse values (start_url, outdir, sitemap,
        be_verbose)."""
        self.options = options
        # Crawl state.  BUGFIX: these were mutable *class* attributes in
        # the original, shared by every instance -- moved here so each
        # crawl is isolated.
        self.browsable_files = []   # URLs that served parseable HTML
        self.valid_links = {}       # parent URL -> [working links]
        self.errors = {}            # parent URL -> [broken links]
        self.processed_urls = []    # every URL already visited
        self.file_links = {}        # page URL -> [hrefs found on it]

    def get_extension(self, url):
        """Return url's file extension (the site root is stripped first so
        dots in the domain name don't count), or "" when there is none."""
        stripped = url.replace(self.options.start_url, "")
        parts = stripped.split(".")
        return parts[-1] if len(parts) > 1 else ""

    def walk(self):
        """Run the crawl, then emit the results to --out files or stdout."""
        self.process_url(self.options.start_url)

        # Build the sitemap XML up front if it was requested.
        sm_text = ""
        if self.options.sitemap:
            sm_text = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
            sm_text += "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
            for f in self.browsable_files:
                sm_text += "\t<url>\n"
                sm_text += "\t\t<loc>%s</loc>\n" % (f)
                sm_text += "\t</url>\n"
            sm_text += "</urlset>\n"

        if self.options.outdir is not None:
            outdir = self.options.outdir
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            self._write_grouped(os.path.join(outdir, "errors.txt"),
                                self.errors, "no errors")
            self._write_grouped(os.path.join(outdir, "links.txt"),
                                self.valid_links, "no links")
            with open(os.path.join(outdir, "browsables.txt"), "w") as fh:
                if self.browsable_files:
                    for f in self.browsable_files:
                        fh.write("\t%s\n" % (f))
                else:
                    fh.write("no browsables")
            if self.options.sitemap:
                with open(os.path.join(outdir, "sitemap.xml"), "w") as fh:
                    fh.write(sm_text)
        else:
            # No output directory: report everything on stdout.
            self._print_grouped("--ERRORS--", self.errors, "no errors")
            self._print_grouped("--VALID LINKS--", self.valid_links,
                                "no valid links")
            print("--BROWSABLE FILES--")
            for f in self.browsable_files:
                print(" " + f)
            if self.options.sitemap:
                print("--SITEMAP--")
                print(sm_text)

    def _write_grouped(self, path, groups, empty_message):
        """Write a {parent: [urls]} mapping to path, one indented url per
        line under its parent; write empty_message when there is nothing."""
        with open(path, "w") as fh:
            if groups:
                for key in groups:
                    fh.write(key + "\n")
                    for page in groups[key]:
                        fh.write("\t%s\n" % (page))
            else:
                fh.write(empty_message)

    def _print_grouped(self, title, groups, empty_message):
        """Print a {parent: [urls]} mapping under a section title."""
        print(title)
        if groups:
            for key in groups:
                print(key)
                for page in groups[key]:
                    print(" %s" % (page))
        else:
            print(empty_message)

    def process_url(self, url, parent="root", depth=0):
        """Fetch url, classify it (error / valid / browsable), and recurse
        into any new links it contains.  parent is the page that linked
        here and keys the error/valid-link bookkeeping."""
        self.processed_urls.append(url)
        self.file_links[url] = []
        verbose = self.options.be_verbose
        if verbose:
            print("processing " + url)

        # mailto: links and in-page anchors are recorded as processed but
        # never fetched.
        if url.lower().startswith("mailto:"):
            if verbose:
                print("found mailto " + url)
            # TODO: record the mailto
            return
        if url.startswith("#"):
            if verbose:
                print("found anchor link " + url)
            return

        try:
            # urllib.request raises HTTPError for >= 400 responses, unlike
            # the Python 2 urllib.urlopen the original relied on, so the
            # status code has to be recovered from the exception.
            try:
                fh = urlopen(url)
                code = fh.getcode()
            except HTTPError as http_err:
                fh = None
                code = http_err.code
            if code >= 400:  # BUGFIX: original tested > 400, passing 400 itself
                self.errors.setdefault(parent, []).append(url)
                if verbose:
                    print("error %d" % (code))
                return
            self.valid_links.setdefault(parent, []).append(url)
            if verbose:
                print(code)

            # Skip files whose extension says they are not web pages.
            ext = self.get_extension(url)
            if ext not in self.webpage_extensions and ext != "":
                if verbose:
                    print("based on extension, we won't parse this file")
                return

            # Only parse html.  BUGFIX: tolerate parameters such as
            # "text/html; charset=utf-8", which endswith("html") rejected.
            content_type = fh.headers.get("Content-type") or ""
            if not content_type.split(";")[0].strip().endswith("html"):
                if verbose:
                    print("bad content type: " + content_type)
                return

            # This is a browsable file.
            self.browsable_files.append(url)

            raw = fh.read()
            fh.close()
            charset = fh.headers.get_content_charset() or "utf-8"
            file_text = raw.decode(charset, "replace")
            if verbose:
                print("reading " + url)

            page = Parser(verbose)
            page.feed(file_text)
            self.file_links[url] = page.get_hrefs()
            base_href = page.get_base_href()
        except Exception as inst:
            print(type(inst))   # the exception instance
            print(inst.args)    # arguments stored in .args
            print(inst)         # __str__ allows args to be printed directly
            if verbose:
                print(url + " is not a valid file")
            self.errors.setdefault(parent, []).append(url)
            return

        if verbose:
            if self.file_links[url]:
                print("--unique links--")
                for link in self.file_links[url]:
                    print(" " + link)
            print("processed " + url)

        # Recurse into every link we have not visited yet.
        for link in self.file_links[url]:
            parse_link = None
            # Only resolve links with no scheme that are not in-page
            # anchors.  TODO: also handle absolute paths on this site.
            if re.match(r"^[A-Za-z]+:", link) is None and not link.startswith("#"):
                if base_href:
                    parse_link = self.get_absolute_url(link, base_href, True)
                else:
                    parse_link = self.get_absolute_url(link, url)
            if parse_link is not None and parse_link not in self.processed_urls:
                if verbose:  # BUGFIX: was an unconditional debug print
                    print(parse_link)
                self.process_url(parse_link, url)

    def get_absolute_url(self, url, parent, parent_is_base_href=False):
        """Resolve a relative url against parent (the linking page's URL,
        or the page's <base> href when parent_is_base_href is True)."""
        bits = parent.split("/")
        # Drop the parent's file component; a base href is already a
        # directory, and "http://host" (two slashes) has no file part.
        if not parent_is_base_href and parent.count("/") > 2:
            del bits[-1]
        # Each "../" climbs one directory further out of the parent path.
        # BUGFIX: the original had leftover "print x" / "sys.exit()" debug
        # lines here that terminated the program, and it discarded the
        # result of url.replace() (strings are immutable).
        for _ in range(url.count("../")):
            del bits[-1]
        url = url.replace("../", "")
        parent_root = "/".join(bits)
        if url.startswith("/"):
            return parent_root + url
        return parent_root + "/" + url


class Application:
    """Command-line front end: defines the options and starts the crawl."""

    def __init__(self):
        # We need an option parser; what do we need from the user?
        self.opt_parser = OptionParser()
        self.opt_parser.add_option("-u", "--url", dest="start_url",
                                   help="The URL to start processing",
                                   metavar="URL")
        self.opt_parser.add_option("-o", "--out", dest="outdir",
                                   help="save output to a specific directory",
                                   metavar="dir")
        self.opt_parser.add_option("-s", "--sitemap", action="store_true",
                                   dest="sitemap", default=False,
                                   help="include generic sitemap data in the output")
        self.opt_parser.add_option("-v", "--verbose", action="store_true",
                                   dest="be_verbose", default=False,
                                   help="display what is going on")

    def run(self):
        """Parse the command line and crawl; a start URL is mandatory."""
        (options, args) = self.opt_parser.parse_args()
        if options.start_url is None:
            print("ERROR: You did not specify a URL to process")
            print("")
            sys.exit()
        jw = JayWalker(options)
        jw.walk()


if __name__ == "__main__":
    app = Application()
    app.run()
On my computer this file is named "walk.py", and to make a sitemap for my website I run:
./walk.py -u http://www.jezra.net -s -o output
The command makes a directory named "output" and creates files with information about my site as well as a sitemap file named "sitemap.xml", which I then upload to my website.
Well there you have it. Copy, edit, learn, or ignore.
Now quit reading, and go crawl with Python.
Comments
Nice idea. I'm going to take this and build a little bit on it. Interested in feedback?
James, since the code does what I need it to do, I'm very interested in how you change the code to do what you need it to do.