# -*- coding: utf-8 -*-
"""Print a status report for the most recent EIAO crawler test run.

The test-run id is read from the last "Crawling site" entry in
/var/log/eiao/crawler.log.  The script then reports:

* how many distinct sites of that run are present in the data warehouse
  staging tables (database ``eiaodwr20``),
* how many domains in the URL repository (database ``eiaourlrep``) have
  at least / fewer than 30 distinct scenarios,
* how many ``crawlerwrapper`` processes are currently running,
* per entry in ``sites``: scenario count, crawl duration, throughput and
  (for individual domains) the target database and finished status.

NOTE(review): the original file had lost its line breaks and
indentation; the statement nesting below is reconstructed from the
syntax and should be confirmed against a pristine copy.
"""
import time
import os

# Entries to report on.  'all' reports on the complete test run; any other
# entry is treated as a domain name.  Example of a per-site selection:
# sites = ['www.u-psud.fr', 'www.comune.bolzano.it', 'www.fingalcoco.ie',
#          'www.gobiernodecanarias.org', 'www.regione.vda.it', 'www.ag.ch',
#          'www.czestochowa.pl', 'all']
sites = ['all']


def shell_output(command):
    """Run *command* through the shell and return everything it printed."""
    pipe = os.popen(command)
    try:
        return pipe.read()
    finally:
        # The original never closed its pipes.
        pipe.close()


def parse_testrun_id(log_line):
    """Return the test-run id embedded in a crawler log line.

    The id is the second ``_``-separated field of the last
    whitespace-separated token of *log_line*.
    """
    return log_line.split()[-1].split('_')[1]


def split_duration(elapsed):
    """Split *elapsed* seconds into whole ``(hours, minutes, seconds)``.

    Fractions of a second are truncated.
    """
    hours, remainder = divmod(int(elapsed), 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds


def latest_testrun_id():
    """Return the id of the most recent test run found in the crawler log.

    Exits with an explanatory message instead of an IndexError when the
    log contains no matching entry.  To inspect an older run, return a
    fixed id (e.g. '5094') from here instead.
    """
    lines = shell_output(
        'grep "Crawling site" /var/log/eiao/crawler.log | grep rdf'
    ).splitlines()
    if not lines:
        raise SystemExit(
            'No "Crawling site" entry found in /var/log/eiao/crawler.log')
    return parse_testrun_id(lines[-1])


def report_site(cur, site, testrunid):
    """Print the statistics for one entry of ``sites``.

    cur       -- open cursor on the URL repository database
    site      -- a domain name, or 'all' for the complete test run
    testrunid -- id of the test run to report on
    """
    whole_run = site == 'all'
    # Explicit parameters instead of locals(); the unused 'site' key is
    # harmless for the queries that do not reference it.
    params = {'site': site, 'testrunid': testrunid}

    if whole_run:
        print('Complete testrun')
        cur.execute(
            "select count(distinct scenarioid) from page "
            "where testrunid=%(testrunid)s;", params)
    else:
        print('  %s' % (site,))
        cur.execute(
            "select count(distinct scenarioid) from page "
            "where domain=%(site)s and testrunid=%(testrunid)s;", params)
    scenario_count = cur.fetchall()[0][0]
    print(' The number of scenarios loaded is: %s' % (scenario_count,))

    if whole_run:
        cur.execute(
            "select min(timestampdownloaded),max(timestampdownloaded) "
            "from page where testrunid=%(testrunid)s;", params)
    else:
        cur.execute(
            "select min(timestampdownloaded),max(timestampdownloaded) "
            "from page where domain=%(site)s and testrunid=%(testrunid)s;",
            params)
    first, last = cur.fetchall()[0]

    # min()/max() yield NULL when no page has been downloaded yet.
    if first is not None:
        if not whole_run:
            fields = shell_output(
                'grep ' + site + ' /var/log/eiao/crawler.log | grep Crawling'
            ).split()
            if fields:
                print(' Database: ' + fields[-1])
            else:
                # Previously an IndexError when the site was not in the log.
                print(' Database: unknown (site not found in crawler log)')
        if last > first:
            elapsed = last - first
            hours, minutes, seconds = split_duration(elapsed)
            print(' Duration: %s h %s m %s s ( %s  seconds)'
                  % (hours, minutes, seconds, int(elapsed)))
            # float() avoids Python 2 integer truncation when the
            # timestamps are integers; the guard avoids dividing by a
            # zero scenario count.
            if scenario_count:
                print(' Seconds per page: %s'
                      % (float(elapsed) / scenario_count,))
                print(' Pages per second: %s'
                      % (scenario_count / float(elapsed),))
        # Timestamps are compared against time.time(), i.e. treated as
        # seconds since the epoch.
        print(' Last scenario: %s' % (time.time() - last,))

    if not whole_run:
        finished = shell_output(
            'grep ' + site + ' /var/log/eiao/crawler.log | grep Finished'
            ' | grep ' + testrunid)
        if finished:
            print(' Status: Finished crawling.')
        else:
            print(' Status: Not finished')


def main():
    """Collect and print the complete status report."""
    # Imported here so the pure helpers above stay importable on machines
    # without the database driver.
    import psycopg

    testrunid = latest_testrun_id()
    print('Testrun: %s' % (testrunid,))
    params = {'testrunid': testrunid}

    # Data warehouse: number of sites of this run that were loaded.
    con = psycopg.connect(user='eiaodw', database='eiaodwr20', password='')
    try:
        cur = con.cursor()
        cur.execute(
            'select count(distinct siteid) from datastaging.site '
            'natural join datastaging.resource '
            'natural join datastaging.resourceversion '
            'where testrunid=%(testrunid)s;', params)
        loaded_sites = cur.fetchall()[0][0]
    finally:
        # The original leaked this connection by rebinding ``con``.
        con.close()
    print('The number of sites crawler and loaded in the DW: %s'
          % (loaded_sites,))
    print('Selected sites:')

    # URL repository: per-domain scenario counts and per-site details.
    con = psycopg.connect(user='eiaourlrep', database='eiaourlrep',
                          password='')
    try:
        cur = con.cursor()
        cur.execute(
            "select domain,count(distinct scenarioid) from page "
            "where testrunid=%(testrunid)s group by domain;", params)
        per_domain = cur.fetchall()
        print('Number of sites with 30 or more: %s'
              % (len([row for row in per_domain if row[1] >= 30]),))
        print('Number of sites with less than 30: %s'
              % (len([row for row in per_domain if row[1] < 30]),))
        print('Number of crawlers running: %s'
              % (shell_output(
                  'ps aux | grep -v grep | grep -c crawlerwrapper'),))
        for site in sites:
            report_site(cur, site, testrunid)
    finally:
        con.close()


if __name__ == '__main__':
    main()