2010-03-15
What I wanted was a simple way to generate a very basic sitemap file that would conform to the Sitemap protocol. Fortunately, I couldn't find anything to do it for me, so I had to write a Python application myself. But wait! Why stop there?
Since the application is basically a site crawler, I might as well add some feature bloat by checking for link errors and gathering some fairly useless information about the crawled site. To make this possible I utilized the OptionParser class from the optparse library so that I could pass in various command-line arguments to affect the way the app outputs data and to choose which URL to parse.
Without further ado... enter the Python:
On my computer this file is named "walk.py", and to make a sitemap for my website I run
Well there you have it. Copy, edit, learn, or ignore.
Now quit reading, and go crawl with Python.
Since the application is basically a site crawler, I might as well add some feature bloat by checking for link errors and gathering some fairly useless information about the crawled site. To make this possible I utilized the OptionParser class from the optparse library so that I could pass in various command-line arguments to affect the way the app outputs data and to choose which URL to parse.
Without further ado... enter the Python:
#!/usr/bin/env python
"""walk.py -- crawl a website, check for broken links, and optionally
generate a sitemap.xml conforming to the Sitemap protocol.

Usage: walk.py -u URL [-o DIR] [-s] [-v]

Ported to Python 3 (print function, html.parser / urllib.request module
moves, dict.has_key removal) with several bug fixes noted inline.
"""
import os
import re
import sys
from html.parser import HTMLParser
from optparse import OptionParser
from urllib.error import HTTPError
from urllib.request import urlopen


class Parser(HTMLParser):
    """Collect unique href values from <a> tags and the page's <base> href."""

    def __init__(self, verbose=False):
        HTMLParser.__init__(self)
        self.clear()
        self.verbose = verbose

    def handle_starttag(self, tag, attrs):
        # We only care about <a> (links to follow) and <base> (the root
        # against which relative links must be resolved).
        if tag == "a":
            href = self.get_value_from_tuple_list("href", attrs)
            if href is not None:
                if self.verbose:
                    print("found href: " + href)
                if href not in self.hrefs:
                    self.hrefs.append(href)
        elif tag == "base":
            href = self.get_value_from_tuple_list("href", attrs)
            if href is not None:
                # BUGFIX: the original printed the href before the None
                # check, raising TypeError on <base> tags without an href.
                if self.verbose:
                    print("found base href: " + href)
                self.base_href = href

    def get_hrefs(self):
        """Return the unique hrefs seen since the last clear()."""
        return self.hrefs

    def get_base_href(self):
        """Return the page's <base> href, or "" if none was seen."""
        return self.base_href

    def get_value_from_tuple_list(self, target_key, tuple_list):
        """Return the value paired with target_key in a list of
        (key, value) tuples, or None when the key is absent."""
        for key, value in tuple_list:
            if key == target_key:
                return value
        return None

    def clear(self):
        """Reset collected state so the parser instance can be reused."""
        self.hrefs = []
        self.base_href = ""


class JayWalker:
    """Crawl a site from options.start_url, recording valid links, broken
    links, and the browsable (HTML) pages found along the way."""

    # Extensions we are willing to fetch and parse as web pages.
    webpage_extensions = ["htm", "html", "php", "asp", "jsp", "py", ""]

    def __init__(self, options):
        """options: parsed optparse values (start_url, outdir, sitemap,
        be_verbose)."""
        self.options = options
        # Crawl state.  BUGFIX: these were mutable *class* attributes in
        # the original, shared by every instance -- moved here so each
        # crawl is isolated.
        self.browsable_files = []   # URLs that served parseable HTML
        self.valid_links = {}       # parent URL -> [working links]
        self.errors = {}            # parent URL -> [broken links]
        self.processed_urls = []    # every URL already visited
        self.file_links = {}        # page URL -> [hrefs found on it]

    def get_extension(self, url):
        """Return url's file extension (the site root is stripped first so
        dots in the domain name don't count), or "" when there is none."""
        stripped = url.replace(self.options.start_url, "")
        parts = stripped.split(".")
        return parts[-1] if len(parts) > 1 else ""

    def walk(self):
        """Run the crawl, then emit the results to --out files or stdout."""
        self.process_url(self.options.start_url)

        # Build the sitemap XML up front if it was requested.
        sm_text = ""
        if self.options.sitemap:
            sm_text = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
            sm_text += "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
            for f in self.browsable_files:
                sm_text += "\t<url>\n"
                sm_text += "\t\t<loc>%s</loc>\n" % (f)
                sm_text += "\t</url>\n"
            sm_text += "</urlset>\n"

        if self.options.outdir is not None:
            outdir = self.options.outdir
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            self._write_grouped(os.path.join(outdir, "errors.txt"),
                                self.errors, "no errors")
            self._write_grouped(os.path.join(outdir, "links.txt"),
                                self.valid_links, "no links")
            with open(os.path.join(outdir, "browsables.txt"), "w") as fh:
                if self.browsable_files:
                    for f in self.browsable_files:
                        fh.write("\t%s\n" % (f))
                else:
                    fh.write("no browsables")
            if self.options.sitemap:
                with open(os.path.join(outdir, "sitemap.xml"), "w") as fh:
                    fh.write(sm_text)
        else:
            # No output directory: report everything on stdout.
            self._print_grouped("--ERRORS--", self.errors, "no errors")
            self._print_grouped("--VALID LINKS--", self.valid_links,
                                "no valid links")
            print("--BROWSABLE FILES--")
            for f in self.browsable_files:
                print(" " + f)
            if self.options.sitemap:
                print("--SITEMAP--")
                print(sm_text)

    def _write_grouped(self, path, groups, empty_message):
        """Write a {parent: [urls]} mapping to path, one indented url per
        line under its parent; write empty_message when there is nothing."""
        with open(path, "w") as fh:
            if groups:
                for key in groups:
                    fh.write(key + "\n")
                    for page in groups[key]:
                        fh.write("\t%s\n" % (page))
            else:
                fh.write(empty_message)

    def _print_grouped(self, title, groups, empty_message):
        """Print a {parent: [urls]} mapping under a section title."""
        print(title)
        if groups:
            for key in groups:
                print(key)
                for page in groups[key]:
                    print(" %s" % (page))
        else:
            print(empty_message)

    def process_url(self, url, parent="root", depth=0):
        """Fetch url, classify it (error / valid / browsable), and recurse
        into any new links it contains.  parent is the page that linked
        here and keys the error/valid-link bookkeeping."""
        self.processed_urls.append(url)
        self.file_links[url] = []
        verbose = self.options.be_verbose
        if verbose:
            print("processing " + url)

        # mailto: links and in-page anchors are recorded as processed but
        # never fetched.
        if url.lower().startswith("mailto:"):
            if verbose:
                print("found mailto " + url)
            # TODO: record the mailto
            return
        if url.startswith("#"):
            if verbose:
                print("found anchor link " + url)
            return

        try:
            # urllib.request raises HTTPError for >= 400 responses, unlike
            # the Python 2 urllib.urlopen the original relied on, so the
            # status code has to be recovered from the exception.
            try:
                fh = urlopen(url)
                code = fh.getcode()
            except HTTPError as http_err:
                fh = None
                code = http_err.code
            if code >= 400:  # BUGFIX: original tested > 400, passing 400 itself
                self.errors.setdefault(parent, []).append(url)
                if verbose:
                    print("error %d" % (code))
                return
            self.valid_links.setdefault(parent, []).append(url)
            if verbose:
                print(code)

            # Skip files whose extension says they are not web pages.
            ext = self.get_extension(url)
            if ext not in self.webpage_extensions and ext != "":
                if verbose:
                    print("based on extension, we won't parse this file")
                return

            # Only parse html.  BUGFIX: tolerate parameters such as
            # "text/html; charset=utf-8", which endswith("html") rejected.
            content_type = fh.headers.get("Content-type") or ""
            if not content_type.split(";")[0].strip().endswith("html"):
                if verbose:
                    print("bad content type: " + content_type)
                return

            # This is a browsable file.
            self.browsable_files.append(url)

            raw = fh.read()
            fh.close()
            charset = fh.headers.get_content_charset() or "utf-8"
            file_text = raw.decode(charset, "replace")
            if verbose:
                print("reading " + url)

            page = Parser(verbose)
            page.feed(file_text)
            self.file_links[url] = page.get_hrefs()
            base_href = page.get_base_href()
        except Exception as inst:
            print(type(inst))   # the exception instance
            print(inst.args)    # arguments stored in .args
            print(inst)         # __str__ allows args to be printed directly
            if verbose:
                print(url + " is not a valid file")
            self.errors.setdefault(parent, []).append(url)
            return

        if verbose:
            if self.file_links[url]:
                print("--unique links--")
                for link in self.file_links[url]:
                    print(" " + link)
            print("processed " + url)

        # Recurse into every link we have not visited yet.
        for link in self.file_links[url]:
            parse_link = None
            # Only resolve links with no scheme that are not in-page
            # anchors.  TODO: also handle absolute paths on this site.
            if re.match(r"^[A-Za-z]+:", link) is None and not link.startswith("#"):
                if base_href:
                    parse_link = self.get_absolute_url(link, base_href, True)
                else:
                    parse_link = self.get_absolute_url(link, url)
            if parse_link is not None and parse_link not in self.processed_urls:
                if verbose:  # BUGFIX: was an unconditional debug print
                    print(parse_link)
                self.process_url(parse_link, url)

    def get_absolute_url(self, url, parent, parent_is_base_href=False):
        """Resolve a relative url against parent (the linking page's URL,
        or the page's <base> href when parent_is_base_href is True)."""
        bits = parent.split("/")
        # Drop the parent's file component; a base href is already a
        # directory, and "http://host" (two slashes) has no file part.
        if not parent_is_base_href and parent.count("/") > 2:
            del bits[-1]
        # Each "../" climbs one directory further out of the parent path.
        # BUGFIX: the original had leftover "print x" / "sys.exit()" debug
        # lines here that terminated the program, and it discarded the
        # result of url.replace() (strings are immutable).
        for _ in range(url.count("../")):
            del bits[-1]
        url = url.replace("../", "")
        parent_root = "/".join(bits)
        if url.startswith("/"):
            return parent_root + url
        return parent_root + "/" + url


class Application:
    """Command-line front end: defines the options and starts the crawl."""

    def __init__(self):
        # We need an option parser; what do we need from the user?
        self.opt_parser = OptionParser()
        self.opt_parser.add_option("-u", "--url", dest="start_url",
                                   help="The URL to start processing",
                                   metavar="URL")
        self.opt_parser.add_option("-o", "--out", dest="outdir",
                                   help="save output to a specific directory",
                                   metavar="dir")
        self.opt_parser.add_option("-s", "--sitemap", action="store_true",
                                   dest="sitemap", default=False,
                                   help="include generic sitemap data in the output")
        self.opt_parser.add_option("-v", "--verbose", action="store_true",
                                   dest="be_verbose", default=False,
                                   help="display what is going on")

    def run(self):
        """Parse the command line and crawl; a start URL is mandatory."""
        (options, args) = self.opt_parser.parse_args()
        if options.start_url is None:
            print("ERROR: You did not specify a URL to process")
            print("")
            sys.exit()
        jw = JayWalker(options)
        jw.walk()


if __name__ == "__main__":
    app = Application()
    app.run()
On my computer this file is named "walk.py", and to make a sitemap for my website I run:
./walk.py -u http://www.jezra.net -s -o output
The command makes a directory named "output" and creates files with information about my site as well as a sitemap file named "sitemap.xml", which I then upload to my website.
Well there you have it. Copy, edit, learn, or ignore.
Now quit reading, and go crawl with Python.
Comments
Nice idea. I'm going to take this and build a little bit on it. Interested in feedback?
James, since the code does what I need it to do, I'm very interested in how you change the code to do what you need it to do.