#!/usr/bin/python """The Crawler module for EIAOHarvestMan2 Including new sampling algorithm """ # Copyright 2005, 2006 EIAO Consoritum # This program is distributed under the terms of the GNU General # Public License. # # This file is part of the European Internet Accessibility Observatory # (EIAO) # # EIAO is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # EIAO is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with EIAO; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301 USA import sys reload(sys) sys.setdefaultencoding('utf-8') #http://trac.edgewall.org/ticket/5628 - Should fix all unicodedecodeerrors we have import re import SOAPpy import urlparse import urllib import sc import os import urlrep import logging #Note that only importing eiaotime here should be safe for used from this script, e.g. harvestman, url repository, etc. from eiaotime import * #import changefreq import psycopg2 import random import time import timeprofile import SOAPpy import urlparse import os import sc from socket import gethostname; from harvestman import * from harvestmanklass import EIAOHarvestMan #import timeoutsocket # Created by Anand on May 15 07 by copying crawler.py __author__ = "Anand B Pillai" __version__ = "0.1" __maintainer__ = "Terje Gjoesaeter, Morten Goodwin Olsen" profiler = timeprofile.timeprofile() import string def onlyascii(u): if u in string.ascii_letters: return u else: return '_' def sortstringlen(x,y): if random.uniform(0,1)<0.75: return cmp(len(x),len(y)) else: return cmp(len(y),len(x)) class Crawler: """The main Crawler class """ def __init__(self, database,site = None): """initialise the class Keyword arguments: database -- Database to connect to """ self.sco=sc.SystemConfiguration() #timeoutsocket.setDefaultSocketTimeout(None) self.siteurlserver = SOAPpy.SOAPProxy(urllib.unquote(self.sco.siteurlserver)) self.urlservertuple = self.siteurlserver.getSiteURL() #timeoutsocket.setDefaultSocketTimeout(int(self.sco.pagetimeout)) self.site,self.testrun,self.etlserver,self.schedulecount,dummy,dummy = (self.urlservertuple) if site: self.site=site #self.site = 'http://portal.tugraz.at' #self.site = 'http://www.handelsregister-sh.de' #self.site = 'http://www.polizei.de/' #self.site = 'http://ec.europa.eu/cip/' #self.site = 'http://www.sadirac.com/' #self.testrun = '1239' #self.site = 'http://www.drk-berlin.de/' print 'Crawling site:',self.site,'to DB:',database pid = os.getpid() #Hack to check in DW con = psycopg2.connect(user=self.sco.dwuser, database=self.sco.dwdatabase, password=self.sco.dwpassword,host=self.sco.dwhost) cur = con.cursor() testrunid = self.testrun thissite = self.site if thissite.startswith('http://'): thissite = thissite[7:] elif thissite.startswith('https://'): thissite = thissite[8:] cur.execute("select True from datastaging.site natural join datastaging.resource natural join datastaging.resourceversion where testrunid=%(testrunid)s and site=%(thissite)s;",locals()) #self.shouldvisit = True if self.sco.forcedwcheck: try: self.shouldvisit = not cur.fetchall()[0][0] except IndexError: self.shouldvisit = True if self.shouldvisit: con = psycopg2.connect(user='eiaourlrep', database='eiaourlrep', password='eiaourlrep') cur = con.cursor() testrun = self.testrun cur.execute("select count(distinct scenarioid) from page where domain=%(thissite)s and testrunid=%(testrun)s;",locals()) try: num = int(cur.fetchall()[0][0]) except: self.shouldvisit = True if num>=self.sco.pagestodownload: samplingserver = SOAPpy.SOAPProxy(self.sco.samplingserver) host = gethostname() samplingserver.loadSample('deprecated',str(self.site),str(database),str(self.testrun),str(self.pid),str(host),self.sco.etlserver) self.shouldvisit = False #Stop of hack self.hostname=gethostname() #if self.site.startswith('http'): # tempsite = urlparse.urlsplit(self.site)[1] self.siteurlserver.keepAlive(thissite,pid,self.hostname) #open(str(database)+'_to_crawl_','w').write(self.site) self.sco.crawlerrdfmodel = 'notmater' self.crawlerrdfmodel = 'nomater' self.crawlerconfigtemplate = self.sco.crawlerconfigtemplate self.webcachedirectory = self.sco.webcachedirectory self.configdirectory = self.sco.configdirectory #self.wamserverstrings = [info['location'] for info in self.sco.products if info['productname'] in ['WAMServer','ImergoWAMServer']] self.database = database # Get an iterator from URL Repository self.urlrepo=urlrep.URLRepUtils3(sco=self.sco) self.urlrepo.currentTestRun = self.testrun self.pid = os.getpid() #initialisation finished def startCrawling(self): """This function performs the necessary steps to complete a testrun; first writing start of testrun, then do the site selection and the crawling of the sites, and finally writing the last part of the testrun """ if self.shouldvisit: self.siteSelection(self.urlrepo) def siteSelection(self, urlrepo): """In this function, we get a list of sites to be crawled, create config files and perform the crawl for each site. Keyword arguments: urlrepo -- URL repository """ #testrun=str(self.urlrepo.getCurrentTestRun(retint=True)) #Test for connecting to the URL repository #1. siteurl as string #2. testrun as integer #3. rdfmodel as string #4. etl-server with port as string #5. number of failing results for this model #6. timeoutcounter (internal use, candidate for delete). if self.site: f = open(self.sco.loglocation+'/crawler.log','a') t = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) f.write(t + ': Crawling site: '+self.site.strip() + ' to DB '+str(self.database)+'\n') f.close() try: conf,location=self.mkConfig(self.site,self.testrun) except IOError,e: f = open(self.sco.loglocation+'/crawler.log','a') t = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) f.write(t + str(e) + '\n') f.close() #Error writing the configuration file else: print "Do the crawl for " + self.site profiler.mark('crawl') pre = time.time() self.doCrawl(conf) site = urlparse.urlsplit(self.site)[1] sitefilename = ''.join([onlyascii(i) for i in str(site)]) #Note the following is a hack to be able to perform performance evaluations. f = open(self.sco.loglocation+'/performance/'+str(sitefilename),'a') f.write('Crawl duration:' + str(time.time()-pre)+'\n') f.close() print 'End of Crawl' f = open(self.sco.loglocation+'/time.log','a') f.write('Crawling site: '+self.site.strip()+' ' +str(profiler.elapsed('crawl'))+'\n') f.close() pre = time.time() #:self.scp.stop() f = open(self.sco.loglocation+'/time.log','a') f.write('Closing scenario collection pool: '+str(time.time()-pre)+'\n') #All threads must be stopped before the N-dictionary can be stored f.close() self.sco.stop() domain = urlparse.urlparse(self.site)[1] try: self.urlrepo.cur.execute('select count(distinct scenarioid) from page where domain=%(domain)s;',locals()) numscenarios = int(self.urlrepo.cur.fetchall()[0][0]) except: print 'Error getting scenario ids' numscenarios = 0 f = open(self.sco.loglocation+'/crawler.log','a') f.write(t + ': Finished crawling site:'+str(self.site)+' to db '+str(self.database)+' Number of available scenarios reached from the crawler '+str(numscenarios)+'\n') f.close() samplingserver = SOAPpy.SOAPProxy(self.sco.samplingserver) host = gethostname() #samplingserver.loadSample(str(location),str(self.site),str(self.database),str(self.testrun),str(self.pid),str(host),self.sco.etlserver) self.urlrepo.close() #This site has been crawled without hang and is therefore excluded from timeout #if not self.exceedmemory: #if 1==1: # self.siteurlserver.finishedSiteURL(str(self.site),str(host)) #if not self.exceedmemory: self.urlrepo.updateExhaustiveScan(self.site,self.testrun,self.exhaustivescan) if self.shouldvisit: samplingserver.loadSample(str(location),str(self.site),str(self.database),str(self.testrun),str(self.pid),str(host),self.sco.etlserver) self.siteurlserver.finishedSiteURL(str(self.site),str(host)) else: print 'Ignoring site. self.shouldvisit set to false' #else: # self.siteurlserver.putSiteURL(str(self.site)) def mkConfig(self, siteurl, testrun): """Create a configuration file for the harvestman project One config file per site, overwrite the file each crawl Naming: config- The structure of the web cache is now as follows: /storedfiles//// Keyword arguments: siteurl - url of the site to be crawled testrun - name of the testrun """ #Read the template templatefile = open(self.crawlerconfigtemplate) templatecontent = templatefile.readlines() templatefile.close() # Find values for variables in config fileself.rw,self.rdfg # Project name projectname=siteurl projectname= re.sub("http://|https://", "", projectname) projectname= re.sub("/", "_", projectname).strip('_') # Construct new basedir basedir=self.webcachedirectory+self.crawlerrdfmodel+"/"+testrun+"/"+str(random.randint(1000,9999))+"/" #substitute values in config content #The following is to support domains both pure domain ad by URLs #thissiteurl = siteurl.lstrip('http://').rstrip('/') if siteurl.startswith('http://'): thissiteurl = siteurl[7:] elif siteurl.startswith('https://'): thissiteurl = siteurl[8:] thissiteurl = thissiteurl.rstrip('/') tempsiteurl = self.urlrepo.getAllStartURLsFromSite(thissiteurl) if tempsiteurl: tempsiteurl.sort(sortstringlen) siteurl = urllib.quote(tempsiteurl[-1],safe=':/') lastmilehack = self.sco.lastmilehack templatecontent=self.replaceString(templatecontent, "##lastmilehack##", lastmilehack) templatecontent=self.replaceString(templatecontent, "##url##", siteurl) templatecontent=self.replaceString(templatecontent, "##name##", projectname) templatecontent=self.replaceString(templatecontent, "##sitereference##", thissiteurl) templatecontent=self.replaceString(templatecontent, "##baseDir##", basedir) templatecontent=self.replaceString(templatecontent, "##proxyServer##", urlparse.urlparse(urllib.unquote(self.sco.webproxy))[1]) templatecontent=self.replaceString(templatecontent, "##proxyUser##", self.sco.webproxyuser) templatecontent=self.replaceString(templatecontent, "##proxyPassword##", self.sco.webproxypassword) templatecontent=self.replaceString(templatecontent, "##proxyPort##", urlparse.urlparse(urllib.unquote(self.sco.webproxy))[2]) #Construct config file name configfilename=self.configdirectory+"config-"+projectname+".xml" configfile=open(configfilename, 'w') for configline in templatecontent: configfile.write(configline) configfile.close return (configfilename,basedir) def doCrawl(self, configfile): """Start a HarvestMan crawl with the specified config and log file Keyword arguments: configfile - name of the config file """ prepare(configfile) crawler = EIAOHarvestMan() print 'Config:',configfile #Give various needed instances to crawler here crawler.setInstances(self.urlrepo,self.siteurlserver) crawler.main() #Flushing outputs to help cron job watchdogs see if there is activity try: self.exceedmemory = crawler._cfg.exceedmemory except (NameError,AttributeError): self.exceedmemory = False if crawler._cfg.exhaustivescan: self.exhaustivescan = False else: self.exhaustivescan = True sys.stdout.flush() sys.stderr.flush() def replaceString(self, content, stringtoreplace, replacestring): """Replaces an occurence of stringtoreplace with replacestring in mulitlined content content - multilined content to have string replaced stringtoreplace - the string which is to be replaced replacestring - the new string Returns the modified string """ return [s.replace(stringtoreplace, replacestring) for s in content] if __name__ == "__main__": print 'Starting crawl' if not '-d' in sys.argv: print "Error. Database not specified" print "Usage:" print "python %s -d databasename" % (sys.argv[0]) sys.exit() dbname = sys.argv[sys.argv.index('-d')+1] print 'crawling with database:',dbname if '-s' in sys.argv: cr=Crawler(database=dbname,site=sys.argv[sys.argv.index('-s')+1]) else: cr=Crawler(database=dbname) cr.startCrawling() cr.sco.stop() sys.stdout.flush() if 'memanalysis' in sys.argv: time.sleep(60)#Making sure all logs are emptied within this time... Most likely a much better way to do this... import os from sizer import scanner from sizer import annotate from sizer import formatting import xmlrpclib from sizer import graph import sys from sizer import operations objs = scanner.Objects() #mods = annotate.simplegroupby(objs, classes=True) #formatting.printsizes(mods) formatting.printsizes(objs, count=100) #prestdout = sys.stdout #sys.stdout = open(cr.sco.loglocation + 'memory.log','w') #somegraph = graph.makegraph(objs, count = 15, proportional = True) #os.system('cp %(somegraph)s ./graph.ps'%locals()) nostr = operations.fix(operations.filterouttype(objs, str)) formatting.printsizesop(operations.bytype(objs), threshold = 1000) lists = operations.filtertype(objs, list) formatting.printsizesop(operations.bysize(lists)) a = [(i.size,i.type,i) for i in objs.values() if i.type==type({})] a.sort() a = a[-50:] print 'Dicionaries:' for size,type,i in a: print i print ' type:',type print ' size:',size try: print ' parent:',i.parent except: pass try: print ' parents:',i.parents except: pass print ' children:'#,i.children for item in i.children: print ' ',item #sys.stdout = prestdout