#! /usr/bin/env python
# -*- coding: UTF-8 -*-
"""The site URL soap server module
"""
# Copyright 2005, 2006 EIAO Consoritum
# This program is distributed under the terms of the GNU General
# Public License.
#
# This file is part of the European Internet Accessibility Observatory
# (EIAO)
#
# EIAO is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# EIAO is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with EIAO; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301 USA

__owner__ = "Terje Gjosater"
__maintainer__ = "Terje Gjosater"
__version__ = 0.1

# NOTE: the import order is kept as it was; timeoutsocket is configured
# between imports and reordering could change what it affects.
from eiaotime import *
import os
import pickle
import re
import time
import random
import sys
import Queue
import SOAPpy
import smtplib
import psycopg
import getopt
from threading import Semaphore
import syslog
import urlparse
import urlrep
import sc
import random
import memcache
import timeoutsocket
timeoutsocket.setDefaultSocketTimeout(None)
import urllib
import collections

sco = sc.SystemConfiguration()
forceReload=False # Do not force a reload. Reuse pickled variables.
PIDFILE="/var/run/siteurlserver.pid" SITEURL=0 TESTRUN=1 ETLSERVER=2 SCHEDULECOUNT=3 class Log: """Ensure auto-flush after each write""" def __init__(self, f): self.f = f def write(self, s): self.f.write(s) self.f.flush() def flush(self): self.f.flush() log=Log(open(sco.loglocation+'/siteurlserver.log','a')) class SiteURLServer2: """Site URL server class """ def __init__(self, port=8889, siteurltimeout=None, maxtries=3): """initialise the class Keyword arguements: port -- [Optional] Port to host at. 8889 as default site -- [Optional] Timeout for a site. Retrieved from the System Configuration as defeult. maxtries -- [Optional] Number of strikes for a site. 3 as default. """ self.errorMessage="" self.port=port #self.siteurltimeout=siteurltimeout self.maxtries=maxtries self.mc = memcache.Client([sco.memcache], debug=0) self.etlserver = urllib.unquote(sco.etlserver) #if not siteurltimeout: # siteurltimeout = int(sco.siteurlreptimeout) #self.siteurltimeout = siteurltimeout #self.siteurltimeout = 1800000 + 100 #FIXME: Do not hardcode the timeout. 
Attempt to avoid #506 self.exceedmemory = [] self.urlrepo=urlrep.URLRepUtils3(sco) self.dbnamecounter=random.randint(1000000,9999999) #self.alivecrawlers = [] self.alivecrawlers = {} self.siteurlexists = {} # Dict of dicts for existence test # Indexed by testrun and URL self.minkeepalive = int(sco.maxnotimeout)#10#300 #The variables below can be pickled if not forceReload: try: f = open('/tmp/siteurlqueuecache.dmp','r') siteurllist = pickle.load(f) for url in siteurllist: self.siteurlqueue.put(url) f.close() except: self.printLog('WARNING: No prior used siteurlqueue available') self.siteurlqueue=Queue.Queue() try: f = open('/tmp/timeoutdictcache.dmp','r') self.timeoutdict = pickle.load(f) f.close() except: self.printLog('WARNING: No prior timeoutdict available') self.timeoutdict={} try: f = open('/tmp/testruncache.dmp','r') self.testrunid = pickle.load(f) f.close() except: self.printLog('WARNING: No prior testrun available') self.testrunid=None try: f = open('/tmp/etlservercache.dmp','r') self.etlserver = pickle.load(f) self.etls=SOAPpy.SOAPProxy(self.etlserver) f.close() except: self.printLog('WARNING: No prior etlserver available') self.etlserver=None self.etls=None try: f = open('/tmp/finishedflagcache.dmp','r') self.finishedflag = pickle.load(f) f.close() except: self.printLog('WARNING: No finishedflag available') self.finishedflag=True try: f = open('/tmp/urlstomailcache.dmp','r') self.urlstomail = pickle.load(f) f.close() self.urlstomailtimer = time.time() except: self.printLog('WARNING: No urlstomail available') self.urlstomail=[] self.urlstomailtimer = time.time() else: self.printLog("INFO: Forced reload") self.siteurlqueue=Queue.Queue() self.timeoutdict={} self.testrunid=None self.etlserver = None self.etls=None self.finishedflag = True self.urlstomail=[] self.urlstomailtimer=time.time() os.system("rm /tmp/*.dmp") self.keepaliveMutex=Semaphore(1) self.timeoutdictMutex=Semaphore(1) self.dbnameMutex=Semaphore(1) if self.etlserver: 
self.etls=SOAPpy.SOAPProxy(self.etlserver) def writeCache(self): """Cache for avoiding stopping the entire crawl if the siteurlserver fails. The Cache should be written regulary and rewritten read from the cache whenever the URL repository has been restarted. """ try: f = open('/tmp/siteurlqueuecache.dmp','w') pickle.dump(list(self.siteurlqueue.queue),f) f.close() except: self.printLog('WARNING: No prior siteurlqueue written') try: f = open('/tmp/timeoutdictcache.dmp','w') pickle.dump(self.timeoutdict,f) f.close() except: self.printLog('WARNING: No prior timeoutdict written') try: f = open('/tmp/testruncache.dmp','w') pickle.dump(self.testrunid,f) f.close() except: self.printLog('WARNING: No prior testrun written') try: f = open('/tmp/etlservercache.dmp','w') pickle.dump(self.etlserver,f) f.close() except: self.printLog('WARNING: No prior etlserver written') try: f = open('/tmp/urlstomailcache.dmp','w') pickle.dump(self.urlstomail,f) f.close() except: self.printLog('WARNING: No prior urlstomail written') def notAlive(self,machine,duration=None): """Function for retrieving which sites are dead Keyword arguments: machine -- machine name e.g. eiao4.hia.no duration -- Duration before a crawler times out. Uses systemconfiguration default if left empty Returns a list of PIDs to be killed for the specified machine Example: >>> #All active crawlers running >>> siteurlserver.notAlive(sco.urlrephost) [] >>> #One crawler not running with pid 1234 >>> siteurlserver.notAlive(sco.urlrephost) [1234] """ #The keepalive list is structured as following: #[(site,pid,machine,timeout),(site,pid,machine,timeout)...] 
self.printLog('Notalive recieved from machine %(machine)s'%locals()) self.printLog('Number of sites left in current testrun:' + str(self.siteurlqueue.qsize())) #if not self.siteurlqueue.qsize(): # import os # os.system('/usr/bin/retrysites') self.keepaliveMutex.acquire() if not duration: duration = self.minkeepalive timedout = [(i[0],i[1]) for i in self.alivecrawlers.get(machine,[]) if time.time()-float(i[3])>duration] #self.printLog('URLs timing out:'+str(timedout)) #self.printLog('All URLs being crawled:'+str([(i[0],i[1],i[3],time.time(),time.time()-i[3]) for i in self.alivecrawlers.get(machine,[])])) for site in timedout: self.putSiteURL(site[0]) self.removeSiteFromAlive(site[0],machine) self.printLog('Site added to queue again' + str(site[0])) self.keepaliveMutex.release() return [i[1] for i in timedout] def keepAlive(self,site,pid,machine): """Function for making sure a site for a crawl is running Keyword arguments: site -- the site crawling e.g. www.foo.com pid -- the PID of the current crawler e.g. machine -- The machine the site is being crawled from Returns None Example: >>> siteurlserver.keepAlive('www.foo.com',1234,'eiao2.eiao.net') """ #Checking is the site,pid,machine exists #self.printLog('Keep Alive recieved from site %(site)s, pid %(pid)d and machine %(machine)s'%locals()) self.keepaliveMutex.acquire() already = self.alivecrawlers.get(machine,[]) existing = [i for i in already if (i[0],i[1],i[2])==(site,pid,machine)] for e in existing: already.remove(e) already.append((site,pid,machine,time.time())) self.alivecrawlers[machine] = already self.keepaliveMutex.release() def removeSiteFromAlive(self,site,machine): """Function for removing site sites from the alivelog Keyword arguments: site -- Site to be removed Returns None Example: >>> siteurlserver.removeSiteFromAlive('www.foo.com',sco.urlserver) """ #Note that this function is note mutexed since it is only called from other internal functions which are mutexed. 
existing = [i for i in self.alivecrawlers.get(machine,[])] #existing = [i for i in self.alivecrawlers if i[0]==site] for e in existing: if e[0]==site: existing.remove(e) self.alivecrawlers[machine] = existing def printLog(self,info): """Printing info to log Keyword arguments: info -- Info to log """ log.write(str(info)+'\n') def urlExists(self,siteurllist,testrun): # Existence test/consistency test. Do not allow URL overlap # within one testrun! try: self.siteurlexists[testrun] except KeyError: self.siteurlexists[testrun]={} for url in siteurllist: try: self.siteurlexists[testrun][url] self.errorMessage="""ERROR: Not allowed to evaluate same URL twice within the same test run. Offending URL: %s""" % (url) return True except KeyError: self.siteurlexists[testrun][url]=1 continue return False def getSiteURL(self): """get a site URL from the queue Returns a tuple consisting of: 1. siteurl as string 2. testrun as integer 3. etl-server with port as string 4. number of failing results 5. timeoutcounter (internal use). """ while self.testrunid and self.siteurlqueue.qsize() == 0: if self.timeoutdict != {}: #print "remaining sites:", self.timeoutdict.keys() #self.printLog("waiting for testrun to finish...") time.sleep(5) #we really don't need the timeouts to be handled before the queue is empty! #self.timeoutHandling() else: #delete testrun including change currenttestrun and clear timeoutdict self.removeTestRun(self.testrunid) self.writeCache() while not self.testrunid: #self.printLog("waiting for another testrun to show up...") time.sleep(15) self.timeoutdictMutex.acquire() #Now get the siteurl try: siteURL = self.siteurlqueue.get() #self.printLog(siteURL) siteURL=(siteURL[0],siteURL[1],siteURL[2],siteURL[3],siteURL[4]+1, time.time()) except Queue.Empty: self.finishedflag = True # Current testrun finished. Remove it. del(self.siteurlexists[self.testrunid]) siteURL = None else: #add the URL to the timeout list! 
self.timeoutdict[siteURL[0]]=siteURL #print self.timeoutdict self.printLog('Returning:'+str(siteURL)) self.printLog('current testrun:' + str(self.testrunid)) self.printLog('Number of sites left in current testrun:' + str(self.siteurlqueue.qsize())) self.timeoutdictMutex.release() return siteURL def testrunId(self): return self.testrunid def finished(self): return self.finishedflag def error(self): return self.errorMessage def addTestRun(self,etlserver,numurls,testrun): """Function for adding testrun so we do not have to write the same code several times. Keyword arguments: etlserver -- ETL Server URL, numurls -- Number of urls, testrun -- testrun, """ self.mc.flush_all() self.etlserver = etlserver self.etls=SOAPpy.SOAPProxy(self.etlserver) return self.etls.testRun(numurls,testrun) def addURLsToExistingTestrun(self,testrun,etlserver,siteurltable = "Site"): if not etlserver: etlserver = self.etlserver con = psycopg.connect(host=sco.urlrephost, user=sco.urlrepusername, database=sco.urlrepdatabase, password=sco.urlreppassword) cur = con.cursor() siteurllist=[] try: cur.execute('select distinct domain from %s where isactive=True;' % siteurltable) except psycopg.ProgrammingError: con.close() self.printLog('Adding testrun failed:'+str(testrun)) self.printLog('Nonexisting table: '+str(siteurltable)) return False urllist=cur.fetchall() #Hack for forcing URls that have not already been crawled con = psycopg.connect(user=sco.dwuser, database=sco.dwdatabase, password=sco.dwpassword,host=sco.dwhost) cur = con.cursor() cur.execute("select distinct site from datastaging.site natural join datastaging.resource natural join datastaging.resourceversion where testrunid=%(testrun)s;",locals()) dwsites = ['http://'+i[0] for i in cur.fetchall()] con.close() #End of hack siteurlsites = [i[0] for i in list(self.siteurlqueue.queue)] siteurllist = [] for siteURL in urllist: siteURL = siteURL[0].strip() if not siteURL.startswith('http'): siteURL = 'http://' + siteURL #Hack for only readadding 
sites that have not yet been crawled. if siteURL not in dwsites and siteURL not in siteurlsites: siteurllist.append(siteURL) print 'Adding site:',siteURL,'since this is not present in the DW.' else: print 'Not adding site:',siteURL,'since this is already in the DW.' #End of hack random.shuffle(siteurllist) for siteURL in siteurllist: urltuple = (siteURL, testrun, etlserver, 0, 0) self.siteurlqueue.put(urltuple) try: self.siteurlexists[testrun][siteURL]=1 except: self.siteurlexists[testrun] = {} self.siteurlexists[testrun][siteURL]=1 def addURLsFromTable(self, testrun, etlserver=None, siteurltable="Site"): """add another batch of site urls from a table print 'URL server started and listening on port %d.'% port print 'Number of sites to crawl:%d'% self.siteurlqueue.qsize() """ if not etlserver: etlserver = self.etlserver #if self.finishedflag == False: # self.printLog('Can not interrupt ongoing testrun:'+str(self.testrunid)) # return False #connect to database con = psycopg.connect(host=sco.urlrephost, user=sco.urlrepusername, database=sco.urlrepdatabase, password=sco.urlreppassword) #con.autocommit() cur = con.cursor() #get all data siteurllist=[] try: #cur.execute("select distinct domain from eiaourlrep.site where isactive=True and keyword like '%eGovMon%';") cur.execute("select distinct domain from eiaourlrep.site where isactive=True;") except psycopg.ProgrammingError: con.close() self.printLog('Adding testrun failed:'+str(testrun)) self.printLog('Nonexisting table: '+str(siteurltable)) return False urllist=cur.fetchall() #close the connection again con.close() #Hack for only readadding sites that have not yet been crawled. 
#con = psycopg.connect(user='eiaodw',database='egovmondw',password='eiaodw',host='eiao1.eiao.net'); #cur = con.cursor() #cur.execute("select distinct site from datastaging.site natural join datastaging.resource natural join datastaging.resourceversion where testrunid='200805';") #dwsites = ['http://'+i[0] for i in cur.fetchall()] dwsites = [] #con.close() #End of hack siteurllist=[] # Clean up the list for siteURL in urllist: siteURL = siteURL[0].strip() if not siteURL.startswith('http'): siteURL = 'http://' + siteURL #Hack for only readadding sites that have not yet been crawled. if siteURL not in dwsites: siteurllist.append(siteURL) print 'Adding site:',siteURL,'since this is not present in the DW.' else: print 'Not adding site:',siteURL,'since this is already in the DW.' #End of hack #siteurllist.append(siteURL) # Do not allow duplicate evaluaton of web sites within test run if self.urlExists(siteurllist,testrun): return False #send messeage to ETL server about the new testrun result = self.addTestRun(etlserver,len(siteurllist),testrun) if not result: self.printLog('Adding testrun failed:'+str(testrun)) return False else: self.testrunid=testrun self.printLog('Number of sites:'+str(len(siteurllist))) random.shuffle(siteurllist) #read list and put each line in the queue for siteURL in siteurllist: urltuple = (siteURL, testrun, etlserver, 0, 0) self.siteurlqueue.put(urltuple) self.siteurlexists[testrun][siteURL]=1 self.finishedflag = False self.writeCache() return True def addURLs(self, testrun, etlserver=None, siteurls=[]): """add another batch of site urls from a file """ if not etlserver: etlserver = self.etlserver #if self.finishedflag == False: # self.printLog('Can not interrupt ongoing testrun:'+str(self.testrunid)) # return False self.printLog(' '.join((str(testrun), str(siteurls), str(etlserver)))) siteurllist=[] if type(siteurls)==type(""): siteurllist.append(siteurls.strip()) #print siteurllist else: siteurllist=[i.strip() for i in siteurls] # Do not 
allow to re-evaluate a web site twice within same testrun if self.urlExists(siteurllist,testrun): return False #send messeage to ETL server about the new testrun #self.etlserver = etlserver #self.etls=SOAPpy.SOAPProxy(self.etlserver) #result = self.etls.testRun(len(siteurllist),testrun) result = self.addTestRun(etlserver,len(siteurllist),testrun) if not result: self.printLog('Adding testrun failed:'+str(testrun)) return False else: self.testrunid=testrun self.printLog('Number of sites:'+str(len(siteurllist))) con = psycopg.connect(user=sco.dwuser, database=sco.dwdatabase, password=sco.dwpassword,host=sco.dwhost) cur = con.cursor() cur.execute("select distinct site from datastaging.site natural join datastaging.resource natural join datastaging.resourceversion where testrunid=%(testrun)s;",locals()) dwsites = ['http://'+i[0] for i in cur.fetchall()] con.close() newsiteurllist = [] for siteURL in siteurllist: #siteURL = siteURL[0].strip() siteURL = siteURL.strip().rstrip('/') if not siteURL.startswith('http'): siteURL = 'http://' + siteURL #Hack for only readadding sites that have not yet been crawled. if siteURL not in dwsites and siteURL not in [i[0] for i in list(self.siteurlqueue.queue)]: newsiteurllist.append(siteURL) print 'Adding site:',siteURL,'since this is not present in the DW.',dwsites else: print 'Not adding site:',siteURL,'since this is already in the DW.' #End of hack random.shuffle(newsiteurllist) #read list and put each line in the queue for siteURL in newsiteurllist: urltuple = (siteURL, testrun, etlserver, 0, 0) self.siteurlqueue.put(urltuple) self.siteurlexists[testrun][siteURL]=1 self.finishedflag = False self.writeCache() return True def putSiteURL(self, siteurl, testrun=0, etlserver=None, schedulecount=0): """put a site URL back on the queue after a failure """ if not etlserver: etlserver = self.etlserver #remove from timeout-list! 
self.timeoutdictMutex.acquire() #self.printLog('putSiteURL: '+str(siteurl)) if testrun==0: testrun=self.testrunid if siteurl in self.timeoutdict.keys() and testrun == self.testrunid: if schedulecount==0: schedulecount=self.timeoutdict[siteurl][4] self._DoPutSiteURL(siteurl, testrun, etlserver, schedulecount) else: #print "Various warnings" #print testrun #print self.testrunid if testrun != self.testrunid: self.printLog('WARNING: Testrun '+str(testrun)+' is not the currently active testrun, site url is not added to queue.') self.etls.siteFailed(testrun) if not siteurl in self.timeoutdict.keys(): self.printLog('WARNING: site is not in timeoutqueue. Site:'+str(siteurl)) self.timeoutdictMutex.release() def _DoPutSiteURL(self, siteurl, testrun, etlserver, schedulecount): """put a site URL back on the queue after a failure """ if siteurl.startswith('http://'): siteurl = urlparse.urlsplit(siteurl)[1] #This makes sure that only domain names are added as sites. #I.e. 'http://www.example.com' is an illegal site name. In contrast, 'www.example.com' is a legal site name. try: self.timeoutdict.pop(siteurl) except KeyError: self.printLog('Not in timeout:'+str(siteurl)) #It should be possible to add a site on the queue, even if it has not already been scheduled for crawling (I.e. is not in timeout-dict). 
siteurlexists = [data for data in self.siteurlqueue.queue if (siteurl==data[0])] if siteurlexists: self.printLog("WARNING: attempting to add URL already in queue: "+ siteurl) return if schedulecount >= self.maxtries: self.printLog('WARNING: Because of too many failed attempts, '+ str(siteurl) +' can not be added to the testrun') self._sendEmail(siteurl) self.etls.siteFailed(testrun) self.urlrepo.incrSchedulingError(siteurl) return else: print 'Adding site for another crawl:',siteurl self.siteurlqueue.put((siteurl,testrun, etlserver,schedulecount, 0)) def finishedSiteURL(self, siteurl,machine): """remove a site URL from the timeout-list """ self.timeoutdictMutex.acquire() self.removeSiteFromAlive(siteurl,machine) self.printLog('finishedSiteURL: '+str(siteurl)) if siteurl in self.timeoutdict: self.timeoutdict.pop(siteurl) self.timeoutdictMutex.release() return True else: self.printLog('WARNING: finished site url is not in the timeout dictionary!') self.timeoutdictMutex.release() return False def removeTestRun(self, testrun="ALL", etlserver=None): if not etlserver: etlserver = self.etlserver if not self.testrunid: self.printLog('No active testruns to remove.') return False if testrun=="current": testruns=[self.testrunid] elif testrun=="ALL": testruns=self.siteurlexists.keys() else: testruns=[testrun] if self.urlstomail: self._sendEmails() for r in testruns: try: self.etls.removeTestRun(r) self.printLog('Removed testrun'+ str(r)) self.finishedflag = True del(self.siteurlexists[r]) # Remove all URL data for testrun q=[i for i in self.siteurlqueue.queue if i[TESTRUN]==r] self.siteurlqueue.queue=collections.deque(q) except: self.printLog('Cannot remove testrun' +str(r)) return False try: allTestruns=self.siteurlexists.keys() allTestruns.sort() self.testrunid=allTestruns[0] except IndexError: self.testrunid = None return True def _sendEmail(self, url): #can not happen more than once at a time because of locking of calling function self.urlstomail.append(url) if 
self.urlstomailtimer < time.time()-3600: try: self._sendEmails() except: print 'Error sending e-mail' self.urlstomailtimer = time.time() self.urlstomail = [] def _sendEmails(self): server = smtplib.SMTP(sco.smtpserver) #email = 'From: siteurlserver@eiao.net\r\nTo:'+ sco.administratoremail + '\r\nsubject:SiteURL failed \r\n\r\n'+'The following site url failed repeatedly in testrun '+ str(self.testrunid)+' and was removed: '+url +'\r\n' email = 'From: '+sco.administratoremail+'\r\nTo: '+ sco.administratoremail + ' \r\nsubject:SiteURL failed \r\n\r\n'+'The following site urls failed repeatedly in testrun '+ str(self.testrunid)+' and was removed: ' + str (self.urlstomail) +'\r\n' #try: server.sendmail(sco.administratoremail,sco.administratoremail,email) server.quit() #except: # self.printLog('Sending warning email failed for ' +str(url)) def addExceedMemory(self,pid): self.exceedmemory.append(pid) def doiexceedmemory(self,pid): if pid in self.exceedmemory: self.exceedmemory.remove(pid) return True else: return False #def timeoutHandling(self): # #check the URLs in the timeout list! 
# self.timeoutdictMutex.acquire() # timeouts = [u for u in self.timeoutdict.keys() if self.timeoutdict[u][5]][-f | --force-reload] [-h|--help] [-i|--import directory]" print "-f option causes the pickled state to be disregarded" def main(port): sus = SiteURLServer2(port) host = urlparse.urlparse(urllib.unquote(sco.siteurlserver))[1] if ':' in host: host = host[:host.find(':')] server = SOAPpy.ThreadingSOAPServer((host, sus.port)) #server = SOAPpy.ThreadingSOAPServer((host,8899)) # Disable timeout on the server socket #server.socket.set_timeout(None) server.registerFunction(sus.addURLsFromTable) server.registerFunction(sus.addURLs) server.registerFunction(sus.getSiteURL) server.registerFunction(sus.putSiteURL) server.registerFunction(sus.finishedSiteURL) server.registerFunction(sus.removeTestRun) server.registerFunction(sus.getdbname) server.registerFunction(sus.keepAlive) server.registerFunction(sus.notAlive) server.registerFunction(sus.removeSiteFromAlive) server.registerFunction(sus.finished) server.registerFunction(sus.testrunId) server.registerFunction(sus.error) server.registerFunction(sus.finishedSiteURL) server.registerFunction(sus.addExceedMemory) server.registerFunction(sus.doiexceedmemory) server.registerFunction(sus.addURLsToExistingTestrun) try: server.serve_forever() except KeyboardInterrupt: sys.exit("Siteurlserver was stopped.") sys.exit(0) def daemonize(): # Disconnect from controlling TTY as a service try: pid = os.fork() if pid > 0: sys.exit(0) except OSError, e: print >>sys.stderr, "fork #1 failed: %d (%s)" % (e.errno, e.strerror) sys.exit(1) # Do not prevent unmounting... 
os.chdir("/") os.setsid() os.umask(0) # do second fork try: pid = os.fork() if pid > 0: # exit from second parent, print eventual PID before #print "Daemon PID %d" % pid open(PIDFILE,'w').write("%d"%pid) sys.exit(0) except OSError, e: print >>sys.stderr, "fork #2 failed: %d (%s)" % (e.errno, e.strerror) sys.exit(1) # Redirect stdout/stderr to log file sys.stdout=sys.stderr=log # UID and GID Nobody os.setegid(99) os.seteuid(99) if __name__== "__main__": port=8889 # Service port number daemonise=False (opts,args)=getopt.getopt(sys.argv[1:],"dp:fhi",["daemonise","port=","force-reload","help",'import']) for opt,arg in opts: if opt in ("-p","--port"): port=int(arg) if opt in ("-f","--force-reload"): forceReload=True if opt in ("-d","--daemonise"): daemonise=True if opt in ("-h","--help"): sys.exit(usage()) if opt in ('-i','--import'): if raw_input('This will the delete all content of the URL repository and instert the basic URLs. Are you sure you want to do that (y/n). No as default.').lower() in ('y','yes'): directory = sys.argv[sys.argv('-i') + 1] import os files = [directory+l for l in os.listdir(directory) if l.endswith('.csv')] if not files: print 'No csv-files found in directory:',directory,'.Please make sure this directory includes at least one csv-file.' else: ur=URLRepUtils3(sco,deletemodel=True) for f in files: print 'Adding sites from file',f ur.addInitialURLs(f) if len(args)>0: sys.exit(usage()) if daemonise: daemonize() else: print "Starting Site URL server on localhost, port",port main(port)