2010-03-15
What I wanted was a simple way to generate a very basic sitemap file that would conform to the Sitemap protocol. Fortunately, I couldn't find an existing tool to do it for me, so I had to write a Python application to do it myself. But wait! Why stop there?

Since the application is basically a site crawler, I might as well add some feature bloat by checking for link errors and gathering some fairly useless information about the crawled site. To make this possible I utilized the OptionParser class from the optparse library so that I could pass in various command-line arguments to affect the way the app outputs data and to choose which URL to parse.

Without further ado... enter the Python:
#!/usr/bin/env python3
"""walk.py -- crawl a site, report link errors, and optionally emit a sitemap.xml.

Reconstructed from an HTML-mangled listing (assignment operators and
parentheses had been stripped) and ported from Python 2 to Python 3.
Fixes applied beyond the reconstruction:
  * removed leftover debug code (print/sys.exit) in get_absolute_url that
    aborted the program on the first "../" link
  * `url = url.replace(...)` -- the original discarded the result
  * HTTP 400 is now treated as an error (original tested `code > 400`)
  * <base> tags without an href no longer raise a TypeError
  * crawl state is per-instance instead of shared class attributes
  * output files are written with `with open(...)` so handles are closed
"""
import os
import re
import sys
import urllib.request
from html.parser import HTMLParser
from optparse import OptionParser


class Parser(HTMLParser):
    """Collects unique <a href> values and the page's <base href> from HTML."""

    def __init__(self, verbose=False):
        HTMLParser.__init__(self)
        self.clear()
        self.verbose = verbose

    def handle_starttag(self, tag, attrs):
        # we only care about 'a' (links to follow) and 'base' (URL resolution)
        if tag == "a":
            href = self.get_value_from_tuple_list("href", attrs)
            if href is not None:
                if self.verbose:
                    print("found href: " + href)
                if href not in self.hrefs:
                    self.hrefs.append(href)
        elif tag == "base":
            href = self.get_value_from_tuple_list("href", attrs)
            # check for None before printing -- the original printed first
            # and crashed on a <base> tag with no href attribute
            if href is not None:
                if self.verbose:
                    print("found base href: " + href)
                self.base_href = href

    def get_hrefs(self):
        """Return the unique hrefs seen so far, in document order."""
        return self.hrefs

    def get_base_href(self):
        """Return the page's <base href>, or "" if none was seen."""
        return self.base_href

    def get_value_from_tuple_list(self, target_key, attr_list):
        """Return the value paired with target_key in a (key, value) list, or None."""
        for (key, value) in attr_list:
            if key == target_key:
                return value
        return None

    def clear(self):
        """Reset collected state so the parser can be reused."""
        self.hrefs = []
        self.base_href = ""


class JayWalker:
    """Crawls a site starting from options.start_url, recording valid links,
    errors, and browsable (HTML) pages."""

    # extensions that are worth fetching and parsing as web pages
    webpage_extensions = ["htm", "html", "php", "asp", "jsp", "py", ""]

    def __init__(self, options):
        self.options = options
        # per-instance crawl state; the original used mutable class
        # attributes, which would be shared between JayWalker instances
        self.browsable_files = []     # URLs that served parseable HTML
        self.valid_links = {}         # parent URL -> list of good child URLs
        self.errors = {}              # parent URL -> list of bad child URLs
        self.processed_urls = []      # every URL already visited
        self.file_links = {}          # URL -> unique hrefs found on that page

    def get_extension(self, url):
        """Return url's file extension relative to start_url, or ""."""
        s_string = url.replace(self.options.start_url, "")
        splits = s_string.split(".")
        if len(splits) > 1:
            return splits[-1]
        return ""

    def walk(self):
        """Crawl from start_url, then write (or print) the collected reports."""
        self.process_url(self.options.start_url)

        # does the user want sitemap data?
        sm_text = ""
        if self.options.sitemap:
            sm_text = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
            sm_text += "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
            for f in self.browsable_files:
                sm_text += "\t<url>\n"
                sm_text += "\t\t<loc>%s</loc>\n" % (f)
                sm_text += "\t</url>\n"
            sm_text += "</urlset>\n"

        if self.options.outdir is not None:
            outdir = self.options.outdir
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            self._write_report(os.path.join(outdir, "errors.txt"),
                               self.errors, "no errors")
            self._write_report(os.path.join(outdir, "links.txt"),
                               self.valid_links, "no links")
            with open(os.path.join(outdir, "browsables.txt"), "w") as fh:
                if self.browsable_files:
                    for f in self.browsable_files:
                        fh.write("\t%s\n" % (f))
                else:
                    fh.write("no browsables")
            # are we making a sitemap?
            if self.options.sitemap:
                with open(os.path.join(outdir, "sitemap.xml"), "w") as fh:
                    fh.write(sm_text)
        else:
            self._print_report("--ERRORS--", self.errors, "no errors")
            self._print_report("--VALID LINKS--", self.valid_links,
                               "no valid links")
            print("--BROWSABLE FILES--")
            for f in self.browsable_files:
                print("    " + f)
            if self.options.sitemap:
                print("--SITEMAP--")
                print(sm_text)

    def _write_report(self, path, mapping, empty_message):
        """Write a {parent: [urls]} mapping to path, or empty_message if empty."""
        with open(path, "w") as fh:
            if mapping:
                for key in mapping:
                    fh.write(key + "\n")
                    for page in mapping[key]:
                        fh.write("\t%s\n" % (page))
            else:
                fh.write(empty_message)

    def _print_report(self, title, mapping, empty_message):
        """Print a {parent: [urls]} mapping under a title banner."""
        print(title)
        if mapping:
            for key in mapping:
                print(key)
                for page in mapping[key]:
                    print("    %s" % (page))
        else:
            print(empty_message)

    def process_url(self, url, parent="root", depth=0):
        """Fetch url, classify it as error/valid/browsable, and recurse into
        any relative links it contains. parent is the page that linked here."""
        self.processed_urls.append(url)
        self.file_links[url] = []
        if self.options.be_verbose:
            print("processing " + url)

        # mailto: links are not crawlable
        if url.startswith(("mailto:", "MAILTO:")):
            if self.options.be_verbose:
                print("found mailto " + url)
            # todo: record the mailto
            return

        # links to an in-page anchor are not crawlable either
        if url.startswith("#"):
            if self.options.be_verbose:
                print("found anchor link " + url)
            return

        base_href = ""
        try:
            fh = urllib.request.urlopen(url)
            # what code did we get? 4xx/5xx are errors (the original tested
            # `> 400` and therefore let HTTP 400 itself pass as valid)
            code = int(fh.getcode())
            if code >= 400:
                self.errors.setdefault(parent, []).append(url)
                if self.options.be_verbose:
                    print("error %d" % (code))
                return
            # record this good link
            self.valid_links.setdefault(parent, []).append(url)
            if self.options.be_verbose:
                print(code)

            # check the file extension -- skip obvious non-pages
            ext = self.get_extension(url)
            if ext not in self.webpage_extensions and ext != "":
                if self.options.be_verbose:
                    print("based on extension, we won't parse this file")
                return

            # check the content type, we only want html; strip any
            # "; charset=..." suffix before comparing
            content_type = fh.headers.get("Content-Type", "")
            if not content_type.split(";")[0].strip().endswith("html"):
                if self.options.be_verbose:
                    print("bad content type: " + content_type)
                return

            # this is a browsable file
            self.browsable_files.append(url)

            file_text = fh.read().decode("utf-8", "replace")
            fh.close()
            if self.options.be_verbose:
                print("reading " + url)

            p = Parser(self.options.be_verbose)
            p.feed(file_text)
            self.file_links[url] = p.get_hrefs()
            base_href = p.get_base_href()
        except Exception as inst:
            print(type(inst))    # the exception instance
            print(inst.args)     # arguments stored in .args
            print(inst)          # __str__ allows args to be printed directly
            if self.options.be_verbose:
                print(url + " is not a valid file")
            self.errors.setdefault(parent, []).append(url)
            return

        if self.options.be_verbose:
            if self.file_links[url]:
                print("--unique links--")
                for link in self.file_links[url]:
                    print("  " + link)
            print("processed " + url)

        # loop through the links and recurse into unvisited relative ones
        for link in self.file_links[url]:
            parse_link = None
            # only resolve links with no scheme that aren't bare anchors
            # TODO: find a better way to process anchored links
            if re.match(r"^[A-Za-z]+:", link) is None and not link.startswith("#"):
                # is there a base href?
                if base_href:
                    parse_link = self.get_absolute_url(link, base_href, True)
                else:
                    parse_link = self.get_absolute_url(link, url)
            else:
                # todo: check if this is an absolute path on the site
                pass
            if parse_link is not None and parse_link not in self.processed_urls:
                self.process_url(parse_link, url)

    def get_absolute_url(self, url, parent, parent_is_base_href=False):
        """Resolve relative url against parent (a page URL, or a base href
        when parent_is_base_href is True) and return the absolute URL."""
        bits = parent.split("/")
        # drop the file part of a page URL; a base href is already a directory
        if not parent_is_base_href and parent.count("/") > 2:
            del bits[-1]
        # each "../" climbs one directory out of the parent
        # (the original left `print x; sys.exit()` debug code in this loop,
        # which killed the program on the first such link)
        up_count = url.count("../")
        for _ in range(up_count):
            del bits[-1]
        # the original called url.replace(...) without assigning the result
        url = url.replace("../", "")
        parent_root = "/".join(bits)
        if url.startswith("/"):
            return parent_root + url
        return parent_root + "/" + url


class Application:
    """Command-line front end: parse options and run the crawler."""

    def __init__(self):
        # we need an option parser; what do we need from the user?
        self.opt_parser = OptionParser()
        self.opt_parser.add_option("-u", "--url", dest="start_url",
                                   help="The URL to start processing",
                                   metavar="URL")
        self.opt_parser.add_option("-o", "--out", dest="outdir",
                                   help="save output to a specific directory",
                                   metavar="dir")
        self.opt_parser.add_option("-s", "--sitemap",
                                   action="store_true", dest="sitemap",
                                   default=False,
                                   help="include generic sitemap data in the output")
        self.opt_parser.add_option("-v", "--verbose",
                                   action="store_true", dest="be_verbose",
                                   default=False,
                                   help="display what is going on")

    def run(self):
        """Parse command-line options and start the crawl; exits if no URL."""
        (options, args) = self.opt_parser.parse_args()
        if options.start_url is None:
            print("ERROR: You did not specify a URL to process")
            print("")
            sys.exit()
        jw = JayWalker(options)
        jw.walk()


if __name__ == "__main__":
    app = Application()
    app.run()

On my computer this file is named "walk.py", and to make a sitemap for my website I run
./walk.py -u http://www.jezra.net -s -o output
The command makes a directory named "output" and creates files with information about my site as well as a sitemap file named "sitemap.xml", which I then upload to my website.

Well there you have it. Copy, edit, learn, or ignore.

Now quit reading, and go crawl with Python.
Comments
2010-03-15 James:
Nice idea. I'm going to take this and build a little bit on it. Interested in feedback?
2010-03-16 jezra:
James, since the code does what I need it to do, I'm very interested in how you change the code to do what you need it to do.
Name:
not required
Email:
not required (will not be displayed)
Website:
not required (will link your name to your site)
Comment:
required
Please do not post HTML code or bbcode unless you want it to show up as code in your post. (or if you are a blog spammer, in which case, you probably aren't reading this anyway).
Prove you are human by solving a math problem! I'm sorry, but due to an increase of blog spam, I've had to implement a CAPTCHA.
Problem:
9 minus 3
Answer:
required
  • Tags:
  • Python
subscribe
 
2016
2015
2014
2013
2012
2011
2010
December
November
October
September
August
July
June
May
April
March
February
January
2009
December
November
October
September
August
July
June
May
April
March
February
January
2008