subscribe
Tags:
 
2012
2011
2010
December
November
October
September
August
July
June
May
April
March
February
January
2009
December
November
October
September
August
July
June
May
April
March
February
January
2008
2010-03-15
What I wanted was a simple way to generate a very basic sitemap file that would conform to the Sitemap protocol. Fortunately, I couldn't find anything that would do it for me, so I had to write a Python application to do it. But wait! Why stop there?

Since the application is basically a site crawler, I might as well add some feature bloat by checking for link errors and gathering some fairly useless information about the crawled site. To make this possible I utilized the OptionParser class from the optparse library so that I could pass in various command-line arguments to affect the way the app outputs data and to choose which URL to parse.

Without further ado... enter the Python:
#!/usr/bin/env python
"""Crawl a web site, report broken links and optionally build a sitemap.

Usage:
    ./walk.py -u http://www.example.com -s -o output

Starting from the given URL the crawler follows every *relative* link it
finds, records which links could and could not be opened, and can emit a
sitemap that follows the Sitemap protocol.  Output goes to a directory
(``-o``) or, when no directory is given, to standard output.

The script runs unchanged on Python 2.6+ and Python 3: every ``print`` call
takes exactly one argument and the renamed standard-library modules are
imported with a fallback.
"""
import os
import posixpath
import re
import sys
from optparse import OptionParser
from xml.sax.saxutils import escape

try:  # Python 3
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    from urllib import urlopen
    from urlparse import urljoin, urlparse

SITEMAP_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9"

# A link that starts with "<letters>:" carries its own scheme (http:, mailto:,
# javascript:, ...) and is therefore not a relative link.
_SCHEME_RE = re.compile(r"^[A-Za-z]+:")


def _decode_body(data, content_type):
    """Return the response body *data* as text.

    On Python 2 the body already is a ``str`` and is returned untouched.  On
    Python 3 the bytes are decoded with the charset named in *content_type*
    (UTF-8 when none is named or the name is unknown); undecodable bytes are
    replaced rather than aborting the crawl of that page.
    """
    if isinstance(data, str):
        return data
    charset = "utf-8"
    for param in content_type.split(";")[1:]:
        name, _, value = param.strip().partition("=")
        if name.strip().lower() == "charset" and value.strip():
            charset = value.strip().strip("\"'")
    try:
        return data.decode(charset, "replace")
    except LookupError:
        return data.decode("utf-8", "replace")


def _write_grouped(path, groups, empty_message):
    """Write a ``{parent: [urls]}`` mapping to *path*, one indented URL per line.

    When *groups* is empty only *empty_message* is written.
    """
    with open(path, "w") as report:
        if groups:
            for key in groups:
                report.write(key + "\n")
                for page in groups[key]:
                    report.write("\t%s\n" % (page,))
        else:
            report.write(empty_message)


def _print_grouped(groups, empty_message):
    """Print a ``{parent: [urls]}`` mapping, or *empty_message* if it is empty."""
    if groups:
        for key in groups:
            print(key)
            for page in groups[key]:
                print("    %s" % (page,))
    else:
        print(empty_message)


class Parser(HTMLParser):
    """HTML parser that collects link targets.

    After ``feed()`` the unique ``href`` values of all ``<a>`` tags are
    available from ``get_hrefs()`` (in document order) and the ``href`` of the
    last ``<base>`` tag, if any, from ``get_base_href()``.
    """

    def __init__(self, verbose=False):
        HTMLParser.__init__(self)
        self.clear()
        self.verbose = verbose

    def handle_starttag(self, tag, attrs):
        """Record the href of ``<a>`` and ``<base>`` tags; ignore other tags."""
        if tag == "a":
            href = self.get_value_from_tuple_list("href", attrs)
            if href is not None:
                if self.verbose:
                    print("found href: " + href)
                if href not in self.hrefs:
                    self.hrefs.append(href)
        elif tag == "base":
            href = self.get_value_from_tuple_list("href", attrs)
            # A <base> tag may carry only a target attribute; check for the
            # href before printing so verbose mode cannot crash on None.
            if href is not None:
                if self.verbose:
                    print("found base href: " + href)
                self.base_href = href

    def get_hrefs(self):
        """Return the list of unique hrefs found so far."""
        return self.hrefs

    def get_base_href(self):
        """Return the ``<base>`` href, or ``""`` when the page has none."""
        return self.base_href

    def get_value_from_tuple_list(self, target_key, list):
        """Return the value paired with *target_key* in *list*, else ``None``.

        *list* is a sequence of ``(key, value)`` tuples as passed to
        ``handle_starttag``.  (The parameter name is kept for compatibility.)
        """
        for key, value in list:
            if key == target_key:
                return value
        return None

    def clear(self):
        """Forget all collected hrefs and the base href."""
        self.hrefs = []
        self.base_href = ""


class JayWalker(object):
    """Site crawler that gathers valid links, broken links and HTML pages.

    Results are kept per instance:

    * ``browsable_files`` -- URLs that were served as HTML
    * ``valid_links``     -- ``{parent url: [urls that opened]}``
    * ``errors``          -- ``{parent url: [urls that failed]}``
    * ``processed_urls``  -- every URL visited, in visiting order
    * ``file_links``      -- ``{url: [hrefs found on that page]}``
    """

    # File extensions that may contain HTML; "" covers extension-less URLs.
    webpage_extensions = ["htm", "html", "php", "asp", "jsp", "py", ""]

    def __init__(self, options):
        self.options = options
        # Created per instance so that two walkers never share results.
        self.browsable_files = []
        self.valid_links = {}
        self.errors = {}
        self.processed_urls = []
        self.depth = 0
        self.file_links = {}
        self.current_file_identifier = ""

    def get_extension(self, str):
        """Return the file extension of the URL *str* without the dot.

        Only the path component is inspected, so dots in the host name or in
        the query string are not mistaken for an extension.  Returns ``""``
        when the path has no extension.  (The parameter name is kept for
        compatibility.)
        """
        path = urlparse(str).path
        return posixpath.splitext(path)[1][1:]

    def walk(self):
        """Crawl from ``options.start_url`` and emit the reports.

        With ``options.outdir`` set, the reports are written into that
        directory; otherwise they are printed.  Sitemap data is included only
        when ``options.sitemap`` is true.
        """
        self.process_url(self.options.start_url)

        sm_text = self._build_sitemap() if self.options.sitemap else None
        if self.options.outdir is not None:
            self._write_reports(self.options.outdir, sm_text)
        else:
            self._print_reports(sm_text)

    def _build_sitemap(self):
        """Return the sitemap XML listing every browsable file."""
        parts = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<urlset xmlns="%s">\n' % (SITEMAP_NAMESPACE,),
        ]
        for page in self.browsable_files:
            # URLs may contain "&", which must be entity-escaped in XML.
            parts.append("\t<url>\n\t\t<loc>%s</loc>\n\t</url>\n" % (escape(page),))
        # Always close the root element, even when no page was found.
        parts.append("</urlset>\n")
        return "".join(parts)

    def _write_reports(self, outdir, sm_text):
        """Write errors, links, browsables and (optionally) the sitemap to *outdir*."""
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        _write_grouped(os.path.join(outdir, "errors.txt"), self.errors, "no errors")
        _write_grouped(os.path.join(outdir, "links.txt"), self.valid_links, "no links")

        with open(os.path.join(outdir, "browsables.txt"), "w") as report:
            if self.browsable_files:
                for page in self.browsable_files:
                    report.write("\t%s\n" % (page,))
            else:
                report.write("no browsables")

        if sm_text is not None:
            with open(os.path.join(outdir, "sitemap.xml"), "w") as report:
                report.write(sm_text)

    def _print_reports(self, sm_text):
        """Print errors, links, browsables and (optionally) the sitemap."""
        print("--ERRORS--")
        _print_grouped(self.errors, "no errors")

        print("--VALID LINKS--")
        _print_grouped(self.valid_links, "no valid links")

        print("--BROWSABLE FILES--")
        for page in self.browsable_files:
            print("    " + page)

        if sm_text is not None:
            print("--SITEMAP--")
            print(sm_text)

    def process_url(self, url, parent="root", depth=0):
        """Open *url*, record the outcome and recurse into its relative links.

        *parent* is the URL of the page on which *url* was found (``"root"``
        for the start URL); results are grouped under it.  *depth* is accepted
        for compatibility and is not used.
        """
        verbose = self.options.be_verbose

        self.processed_urls.append(url)
        self.file_links[url] = []
        if verbose:
            print("processing " + url)

        if url.startswith(("mailto:", "MAILTO:")):
            if verbose:
                print("found mailto " + url)
            # todo: record the mailto
            return

        if url.startswith("#"):
            if verbose:
                print("found anchor link " + url)
            # todo: record the anchor
            return

        # Any failure while fetching or parsing one URL is recorded as an
        # error for that URL; it must not abort the rest of the crawl.
        try:
            page = self._fetch_page(url, parent)
        except Exception as inst:
            print(type(inst))
            print(inst.args)
            print(inst)
            if verbose:
                print(url + " is not a valid file")
            self.errors.setdefault(parent, []).append(url)
            return

        if page is None:
            return
        links, base_href = page
        self.file_links[url] = links

        if verbose:
            if links:
                print("--unique links--")
                for link in links:
                    print("  " + link)
            print("processed " + url)

        # A <base href> may itself be relative; resolve it against the page.
        base = urljoin(url, base_href) if base_href else url

        for link in links:
            # Only relative links are followed.  Links with a scheme, pure
            # anchors and protocol-relative ("//host/...") links are skipped.
            # TODO: find a better way to process anchored links
            # todo: check if an absolute link points back into this site
            if _SCHEME_RE.match(link) or link.startswith(("#", "//")):
                continue
            target = self.get_absolute_url(link, base, bool(base_href))
            if target not in self.processed_urls:
                print(target)
                self.process_url(target, url)

    def _fetch_page(self, url, parent):
        """Open *url*, record it, and return ``(hrefs, base_href)`` for HTML pages.

        Returns ``None`` when the URL answered with an error status or is not
        an HTML page that should be parsed.  Exceptions raised while opening,
        reading or parsing propagate to the caller.
        """
        verbose = self.options.be_verbose

        fh = urlopen(url)
        try:
            # No status is reported for non-HTTP schemes such as file:.
            code = fh.getcode()
            if code is not None:
                code = int(code)

            if code is not None and code >= 400:
                self.errors.setdefault(parent, []).append(url)
                if verbose:
                    print("error %d" % (code,))
                return None

            self.valid_links.setdefault(parent, []).append(url)
            if verbose:
                print(code)

            ext = self.get_extension(url)
            if ext.lower() not in self.webpage_extensions:
                if verbose:
                    print("based on extension, we won't parse this file")
                return None

            # Compare only the media type: the header usually carries
            # parameters as well, e.g. "text/html; charset=UTF-8".
            content_type = fh.info().get("Content-type") or ""
            media_type = content_type.split(";")[0].strip().lower()
            if not media_type.endswith("html"):
                if verbose:
                    print("bad content type: " + content_type)
                return None

            self.browsable_files.append(url)
            data = fh.read()
        finally:
            fh.close()

        if verbose:
            print("reading " + url)

        p = Parser(verbose)
        p.feed(_decode_body(data, content_type))
        return p.get_hrefs(), p.get_base_href()

    def get_absolute_url(self, url, parent, parent_is_base_href=False):
        """Resolve the relative reference *url* against *parent*.

        Resolution follows the standard rules implemented by ``urljoin``:
        ``/path`` is resolved from the site root, ``../`` steps up one
        directory, and anything else is relative to the parent's directory.
        *parent_is_base_href* is accepted for backward compatibility; the
        same rules apply whether *parent* is a page URL or a ``<base>`` href.
        """
        return urljoin(parent, url)


class Application(object):
    """Command-line front end: parses the options and runs a JayWalker."""

    def __init__(self):
        self.opt_parser = OptionParser()
        self.opt_parser.add_option(
            "-u", "--url", dest="start_url",
            help="The URL to start processing", metavar="URL")
        self.opt_parser.add_option(
            "-o", "--out", dest="outdir",
            help="save output to a specific directory", metavar="dir")
        self.opt_parser.add_option(
            "-s", "--sitemap",
            action="store_true", dest="sitemap", default=False,
            help="include generic sitemap data in the output")
        self.opt_parser.add_option(
            "-v", "--verbose",
            action="store_true", dest="be_verbose", default=False,
            help="display what is going on")

    def run(self, args=None):
        """Parse *args* (default: the command line) and crawl the given URL.

        Exits with status 1 when no start URL was supplied.
        """
        (options, _unused_args) = self.opt_parser.parse_args(args)
        if options.start_url is None:
            print("ERROR: You did not specify a URL to process")
            print("")
            sys.exit(1)
        jw = JayWalker(options)
        jw.walk()


if __name__ == "__main__":
    app = Application()
    app.run()

On my computer this file is named "walk.py", and to make a sitemap for my website I run
./walk.py -u http://www.jezra.net -s -o output
The command makes a directory named "output" and creates files with information about my site as well as a sitemap file named "sitemap.xml", which I then upload to my website.

Well there you have it. Copy, edit, learn, or ignore.

Now quit reading, and go crawl with Python.
Comments
2010-03-15 James:
Nice idea. I'm going to take this and build a little bit on it. Interested in feedback?
2010-03-16 jezra:
James, since the code does what I need it to do, I'm very interested in how you change the code to do what you need it to do.
Name:
not required
Email:
not required (will not be displayed)
Website:
not required (will link your name to your site)
Comment:
required
Please do not post HTML code or bbcode unless you want it to show up as code in your post. (or if you are a blog spammer, in which case, you probably aren't reading this anyway).
Prove you are human by solving a math problem! I'm sorry, but due to an increase of blog spam, I've had to implement a CAPTCHA.
Problem:
6 plus 9
Answer:
required
  • Tags:
  • Python