import time import sys import os import string import formatter import httplib import urllib2 import htmllib import socket import urlparse import do_sql import mysql_tidy debug = 2 sleep_time = 0 CURRENT_REFERER_BASE = '' # Time connections out after 15 seconds # This requires Python 2.3.3 or higher # socket.setdefaulttimeout(80) def parse_url(this_url, base=CURRENT_REFERER_BASE): results = {} results['url'] = this_url result_list = [] base_path = list(urlparse.urlparse(base)).pop(3) result_list = list(urlparse.urlparse(this_url)) for this_key in ['fragment','query','params','path','host','scheme']: results[this_key] = result_list.pop() if results['query'] != '' and string.find(results['path'], results['query']) == -1: results['path'] = results['path'] + '?' + results['query'] if string.find(results['path'], '/') == -1: results['path'] = base_path + '/' + results['path'] if string.find(results['path'], '/') > 0: results['path'] = base_path + '/' + results['path'] return results def add_noscript_param(path): """ Function for adding noscript cgi parameters to URLs to exlcude javascript from the HTML """ if string.find(path,'?') > -1: path = path + '&noscript=1' else: path = path + '?noscript=1' return path def do_head(domain_name, path, debug): """ Function to get an HTTP response code. Use GET not HEAD as HEAD gives a 200 inexplicably for redirects """ path = add_noscript_param(path) # Except don't! Do a GET coz HEAD returns different status codes for redirects sesh = httplib.HTTP(domain_name) sesh.putrequest('GET', path) # Set the name of our user agent sesh.putheader("User-Agent", 'medev.ac.uk Python MySQL Crawler') sesh.putheader('Accept','text/html') sesh.putheader('Accept','application/octet-stream') sesh.putheader('Accept','text/xml') sesh.endheaders() # Get our reply httpcode, httpmsg, headers = sesh.getreply() # Set our results dictionary results = {} results['status'] = httpcode results['message'] = httpmsg headers_tup = headers.items() if headers.has_key('content-type'): results['content_type'] = headers.get('content-type') else: results['content_type'] = 'unknown' results['headers'] = headers_tup # If we are redirected set the new location to results['location'] if results['status'] == 301 or results['status'] == 302 or results['status'] == 303 or results['status'] == 305: results['location'] = headers.getheader('location') if debug ==1: print results return results def do_get(domain_name, path, debug): sesh = httplib.HTTP(domain_name) # Tack on an extra cgi param to exclude javascript crap path = add_noscript_param(path) # Do a head request first to test the response code results = do_head(domain_name,path, debug) # Only proceed if we get a 200 or a 30x if results['status'] != 200: if results.has_key('location'): pass else: if results['status'] == 500: print parse_url(path) return results # If we get redirect call the Location header if results.has_key('location'): path = add_noscript_param(results['location']) sesh.putrequest('GET',path) sesh.putheader("User-Agent", 'medev.ac.uk Python MySQL Crawler') sesh.putheader('Accept','text/html') sesh.putheader('Accept','application/octet-stream') sesh.putheader('Accept','text/xml') sesh.endheaders() httpcode, httpmsg, headers = sesh.getreply() results = {} results['status'] = httpcode results['message'] = httpmsg results['headers'] = headers.items() if headers.has_key('content-type'): results['content_type'] = headers.get('content-type') if debug ==1: print results doc = sesh.getfile() data = doc.read() # If we don't have an HTML doc return a 406 error if string.find(results['content_type'], 'text/html') < 0: results['status'] = 406 if string.find(str(data),'') < 0: results['status'] = 406 else: results['document'] = data #if debug == 1: print results return results def parse_html(this_html, domain_list, debug): link_list = [] results = {} writer = formatter.DumbWriter() format = formatter.AbstractFormatter(writer) parser = htmllib.HTMLParser(format) try: parser.feed(this_html) except: results['status'] = 0 return results parser.close() print print count = 1 for link in parser.anchorlist: for this_domain in domain_list: rel_link = string.replace(link, 'http://' + this_domain, '') print count, ' => ', link link_list.append(rel_link) count = count + 1 results['title'] = parser.title results['base'] = parser.base CURRENT_REFERER_BASE = results['base'] results['status'] = 1 results['link_list'] = link_list results['html'] = this_html return results def capture_parse_results(this_html, domain_list, temp_file, debug): saveout = sys.stdout sys.stdout = open(temp_file, 'w') results = parse_html(this_html, domain_list, debug) sys.stdout = saveout if results['status'] == 0: return results fsock = open(temp_file) results['document'] = fsock.read() fsock.close() return results def remove_index_html(this_url): for this_index in ['index_html','index.html']: this_url = string.replace(this_url, this_index, '') this_url = string.replace(this_url, "\n", '') this_url = string.replace(this_url, '&', '&') #this_url = string.replace(this_url, " ", '+') return this_url def do_proxied_head(this_url, use_proxy, proxy_url): request = urllib2.Request(url=this_url) request.add_header('User-Agent','medev.ac.uk Python MySQL Crawler') if use_proxy == 1: opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy_url})) else: opener = urllib2.build_opener() urllib2.install_opener(opener) results = {} try: reply = urllib2.urlopen(this_url) results['status'] = 200 headers = reply.headers.dict results['headers'] = headers if headers.has_key('content-type'): results['content_type'] = headers['content-type'] except urllib2.HTTPError, e: results['status'] = e.code results['content_type'] = 'unknown' results['response_code'] = results['status'] return results def handle_page(start_url, domain_list, temp_file, debug): reply = parse_url(start_url) if debug == 1: print reply reply['path'] = remove_index_html(reply['path']) #reply['path'] = add_noscript_param(reply['path']) if debug == 1: print reply domain_name = reply['host'] from_here = from_this_domain(domain_name, domain_list) # If we've got an external URL return 406 if from_here == 0: #flag_as_external(start_url) return {'status' : 0, 'message' : 'External link', 'response_code' : 406, 'content_type' : 'unknown'} this_url = reply['path'] if reply['query'] != '' and string.find(reply['path'], reply['query']) == -1: this_url = this_url + '?' + reply['query'] #this_url = add_noscript_param(this_url) results = do_get(domain_name, this_url, debug) content_type = results['content_type'] # Return a 0 status unless we have an HTML document if results['content_type'] != 'text/html': return {'status' : 0, 'message' : "\n\nThis is not an HTML resource. " + start_url + ' Content-type returned: ' + str(results['content_type']), 'response_code' : results['status'], 'content_type' : results['content_type']} # Return a 0 unless we have a successful request if results['status'] != 200: return {'status' : 0, 'message' : "\n\nFailed to get resource. " + start_url + ' GET returned: ' + str(results['status']), 'response_code' : results['status'], 'content_type' : results['content_type']} this_html = results['document'] results = capture_parse_results(this_html, domain_list, temp_file, debug) results['content_type'] = content_type if results['status'] == 0: return {'status' : 0, 'message' : 'Parse run failed..'} else: if debug == 1: print results['document'], "\n\n", results['link_list'] return {'status' : 1, 'message' : 'OK', 'response_code' : 200, 'document' : results['document'], 'link_list' : results['link_list'], 'content_type' : results['content_type'], 'html' : results['html'], 'title' : results['title'], 'base' : results['base']} def from_this_domain(host, domain_list): for dom in domain_list: if host == dom: return 1 return 0 def insert_web_site_text(this_url, reply, debug): url_chunks = parse_url(this_url) this_text = reply['document'] this_html = reply['html'] if reply.has_key('content_type'): pass else: reply['content_type'] = 'unknown' sql = "insert into web_site_text(link_url, link_type, response_code, link_scheme, link_host, " sql = sql + "link_path, link_query, link_params, link_fragment, link_content_type, link_text, " sql = sql + "link_html, link_title, link_base_href) " sql = sql + "values('%s','internal','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" this_title = mysql_tidy.tidy(reply['title']) sql = sql % (this_url, reply['response_code'], url_chunks['scheme'], url_chunks['host'], url_chunks['path'], url_chunks['query'], url_chunks['params'], url_chunks['fragment'], reply['content_type'], mysql_tidy.tidy(this_text), mysql_tidy.tidy(this_html), this_title, reply['base']) if debug == 1: print sql if debug == 2: print 'title: ' + str(reply['title']) print 'base: ' + str(reply['base']) result = do_sql.do_sql('ltsn01_ng',sql,debug) do_sql.check_status(result, debug) def insert_link_relationships(this_url, link_list, debug): for this_link in link_list: #time.sleep(sleep_time) sql = "insert into referer_refered_link(referer_url, rrl_link_url) values('%s','%s')" sql = sql % (mysql_tidy.tidy(this_url), mysql_tidy.tidy(this_link)) if debug ==1: print sql result = do_sql.do_sql('ltsn01_ng', sql, debug) do_sql.check_status(result, debug) def handle_external(this_url, debug): reply = do_proxied_head(this_url, 1, 'http://wwwcache.ncl.ac.uk:8080') if debug == 1 or debug == 2: print this_url print reply print status = reply['status'] if reply.has_key('content_type'): content_type = reply['content_type'] else: reply['content_type'] = 'unknown' content_type = reply['content_type'] url_chunks = parse_url(this_url) sql = "insert into web_site_text(link_url, link_type, response_code, link_scheme, link_host, " sql = sql + "link_path, link_query, link_params, link_fragment, link_content_type) " sql = sql + "values('%s','external','%s','%s','%s','%s','%s','%s','%s','%s')" sql = sql % (this_url, reply['response_code'], url_chunks['scheme'], url_chunks['host'], url_chunks['path'], url_chunks['query'], url_chunks['params'], url_chunks['fragment'], content_type) if debug == 1: print sql result = do_sql.do_sql('ltsn01_ng',sql,debug) do_sql.check_status(result, debug) def __init__(): result = do_sql.do_sql('ltsn01_ng','delete from web_site_text',debug) do_sql.check_status(result, debug) result = do_sql.do_sql('ltsn01_ng','delete from referer_refered_link',debug) do_sql.check_status(result, debug) # A list of all the URLs on the site total_tally = {} # A list of links to pages external to the site external_links = {} # A list of broken links broken_links = {} # A list of parsed links parsed_tally = {} # A list of mailto links mailto_list = {} # A list of non-html links non_html_links = {} domain_list = ['www.medev.ac.uk','www.ltsn-01.ac.uk'] temp_file = 'mysql_robot.temp' try: start_url = sys.argv[1] except: sys.exit('Usage: python ' + sys.argv[0] + ' ') if debug == 1: print start_url reply = handle_page(start_url, domain_list, temp_file, debug) # 406 indicates external links as well as non-supported MIME types if reply['status'] == 0: sys.exit(reply['message'] + ' ' + str(reply['response_code']) + "\n\n" + str(reply)) else: if debug == 1: print reply['document'] + "\n\n" + str(reply['link_list']) print reply['message'] + ' ' + str(reply['response_code']) + ' ' + reply['content_type'] master_list = reply['link_list'] counter = 0 total_tally[start_url] = 1 parsed_tally[start_url] = reply['response_code'] while len(master_list) > 0: this_url = master_list.pop(0) this_url = remove_index_html(this_url) reply = parse_url(this_url) url_chunks = reply # Remove internal fragment links to avoid confusion further down the chain if reply['fragment'] != '': this_url = string.replace(this_url, '#' + reply['fragment'], '') # If we have no scheme and no host then it must be a relative URL if reply['scheme'] == '' and reply['host'] == '': this_url = 'http://www.medev.ac.uk' + this_url # Skip this URL if it is already in the total tally if total_tally.has_key(this_url): pass # Skip this URL if it is already in the parsed tally elif parsed_tally.has_key(this_url): total_tally[this_url] = 1 # Check to see if we have an external http link URL elif reply['scheme'] == 'http' and from_this_domain(reply['host'], domain_list) == 0: if external_links.has_key(this_url): pass else: if debug == 1 or debug == 2: print 'External link: ' + this_url time.sleep(sleep_time) external_links[this_url] = 1 total_tally[this_url] = 1 # Check for a mailto link elif reply['scheme'] == 'mailto': if mailto_list.has_key(this_url): pass else: mailto_list[this_url] = 1 total_tally[this_url] = 1 if debug == 1 or debug == 2: print 'mailto link: ' + this_url time.sleep(sleep_time) else: if debug == 1 or debug == 2: print str(counter) + ' ' + this_url time.sleep(sleep_time) # Try and get the page reply = handle_page(this_url, domain_list, temp_file, debug) if reply['status'] == 0: total_tally[this_url] = 1 if debug == 1: print reply['message'] if reply['response_code'] != 200: total_tally[this_url] = 1 parsed_tally[this_url] = reply['response_code'] if reply['response_code'] == 406 and reply['content_type'] != 'text/html': non_html_links[this_url] = reply['content_type'] else: broken_links[this_url] = {'code' : reply['response_code'], 'type' : reply['content_type'], 'message' : reply['message']} if debug == 1 or debug == 2: print reply['message'] else: # Do SQL stuff here insert_web_site_text(this_url, reply, debug) insert_link_relationships(this_url, reply['link_list'], debug) for add_me in reply['link_list']: if total_tally.has_key(add_me): pass elif parsed_tally.has_key(add_me): pass else: master_list.append(add_me) total_tally[this_url] = 1 parsed_tally[this_url] = reply['response_code'] counter = counter + 1 #if counter > 10: sys.exit() if debug == 1: print "Total number of URLs: " + str(len(total_tally.keys())) print "Total number of parsed URLs: " + str(len(parsed_tally.keys())) print "Total number of external URLs: " + str(len(external_links.keys())) print "Total number of mailto URLs: " + str(len(mailto_list.keys())) print "Broken links: " print broken_links print "Non-HTML links: " print non_html_links counter = 0 for this_link in external_links.keys(): counter = counter + 1 print counter time.sleep(sleep_time) handle_external(this_link, debug) __init__() #handle_external('http://www.google.com/', debug) #handle_external('http://www.google.com/pooh/', debug)