"""Module for performing adaptive sampling""" # Copyright 2005, 2006 EIAO Consoritum # This program is distributed under the terms of the GNU General # Public License. # # This file is part of the European Internet Accessibility Observatory # (EIAO) # # EIAO is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # EIAO is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with EIAO; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301 USA __author__ = "Morten Goodwin Olsen" __maintainer__ = 'Morten Goodwin Olsen' __version__ = "$Id$" from math import * from pygsl.statistics import * import RDF import time import pdb if not __name__ == 'AdaptiveSampling.adaptivesampling': #Hack for making documentation generation possible from re import compile,DOTALL import sc from adaptivesamplingerror import * global samplenum samplenum = 0 global sitedictionary sitedictionary = {} global siteresultdict siteresultdict = {} global siteerrorcount siteerrorcount = 0 global sitepasscount sitepasscount = 0 #Precompiled regular expressions if not __name__ == 'AdaptiveSampling.adaptivesampling': rdfid = compile('rdf:ID=\".+?\"') earlresult = compile(r'earl:result rdf:ID.+?strawman\#....',DOTALL) def isFloat(value): try: float(value) return True except ValueError: return False def translateDict(value): """Translates the WAMIDs to URLs to make quicker lookups possible Keyword arguments: value -- WAM ID not as URL returns WAM ID as URL Examples: >>> translateDict('EIAO:B10.11.01.001.001-001') 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.11.01.001.001-001' >>> translateDict('Imergo:B10.10.01.001.001-001.001') 'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.B10.10.01.001.001-001.001' """ eiaourl = 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.' imergourl = 'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.' return value.replace('EIAO:',eiaourl).replace('Imergo:',imergourl) def removeTail(value): """Removes the tail of a string to retrieve the actual ID keyword arguments: value -- WAM ID with tail returns WAM ID without tail Example: >>> removeTail('http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001.001-V402382') 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001' """ return value[:value.rfind('.')] if not __name__ == 'AdaptiveSampling.adaptivesampling': global cwamvalues sc = sc.SystemConfiguration() cwamvalues = dict([(translateDict(a[0]),float(a[1])) for a in [line.split('\t') for line in open(sc.barriercomputationfile).readlines()] if isFloat(a[1])]) def dynamic(reacherrmarg,site,earlsamples,urlrep,ci,minscenario,writeresult=True): """Perform dynamic sampling Keyword arguments: reacherrmarg -- Error margin to reach site -- The site this sample is related to earlsample -- Complete EARL for this one sample writeresult -- If the aggregated results should be counted as being part of the site. """ global cwamvalues global sitedictionary if not sitedictionary.has_key(site): sitedictionary[site] = [] earlsample = ''.join(earlsamples) if writeresult: sitedictionary[site] = sitedictionary[site]+[getEARLCWAM2(earlsample)] errmargin = 1 num = 0 avg,stddev,errmargin = getAvgCWAM(site,ci) else: errmargin = 1 num = 0 avg,stddev,errmargin = getAvgCWAM(site,ci,sitedictionary[site]+[getEARLCWAM2(earlsample)],urlrep.getsamplecount()) if urlrep.getsamplecount()>=int(sc.minsamplecount): return True,avg,stddev,errmargin else: return False,avg,stddev,errmargin #End of FIXME if errmargin>reacherrmarg or urlrep.getsamplecount()<=minscenario: return False,avg,stddev,errmargin else: return True,avg,stddev,errmargin def getAvgCWAM(site,ci,cwams=[],samplecount=0): """Returns CWAM value for samples Keyword argurments: samples -- All samples to be aggregated returns average, standard deviation and error margin """ #Note: This is called directly! #Used for getting the final site result in the end #Returns average from brps algorithm, and sd/err-marg from average over pages average = 0 standarddev = 0 errmarg = 0 global samplenum samplenum = 0 try: if not cwams: cwams = [sample for sample in sitedictionary[site] if sample] except KeyError: raise NoSiteInformationError(site) #average = mean(cwams) global sitepasscount global siteerrorcount errorlist = [1 for i in range(siteerrorcount)] passlist = [0 for i in range(sitepasscount)] average = getBarrierRatioPerSite(sitepasscount,siteerrorcount) #stddev = sd_m(cwams,average) stddev = sd_m(errorlist+passlist,average) if cwams: #standarddev = variance(cwams)#sd_m(cwams,average) #errmarg = 1.96*stddev/(sqrt(len(cwams))) #errmarg = 2.326*stddev/(sqrt(len(cwams))) if sitepasscount+siteerrorcount>0: errmarg = ci*stddev/(sqrt(sitepasscount+siteerrorcount)) #errmarg = 2.326*stddev/(sqrt(sitepasscount+siteerrorcount)) else: errmarg = 0 else: standarddev = 0 errmarg = 0 return average,stddev,errmarg def getEARLCWAM2(EARL, scorealgorithm="brps"): """Getting CWAM Value of an EARL Sample without usering RedLand Keyword arguments: EARL -- Valid EARL as Strign scorealgorithm -- the algorithm for calculating score: "uwem", "brpp", "brps", "brdpp_s", "brdps_s", "brdpp_p", "brdps_p" Returns CWAM value of the EARL sample Example: >>> earl = '' >>> print getEARLCWAM2(earl) 0.5 """ #Performing adaptive sampling using simple regular expressions instead of redland for parsing, made the changed the speed to 1.39s from 106.6s. In other words almost 100 times performence increase. Because of this getEARLCWAM2 is preferable to use instead of getEARLCWAM. pre = time.time() if scorealgorithm=="uwem": sample = list(set([''.join(['http://www.eiao.net/rdf/1.0/#',removeTail(rdfid.findall(s)[0][8:-1])]) for s in earlresult.findall(EARL) if s.endswith('fail')])) #f = open('/var/log/eiao/time.log','a') #f.write('Getting samples (adaptive sampling):'+str(time.time()-pre) + '\n') #f.close() return getCWAM(sample) else: errorcount = EARL.count('nmg-strawman#fail') passcount = EARL.count('nmg-strawman#pass') #f = open('/var/log/eiao/time.log','a') #f.write('Getting samples (adaptive sampling):'+str(time.time()-pre) + '\n') #f.close() #Save the samples for overall site calculation and return page result global siteerrorcount global sitepasscount siteerrorcount = siteerrorcount + errorcount sitepasscount = sitepasscount + passcount return getBarrierRatioPerPage(passcount, errorcount) def getEARLCWAM(EARL): """Getting CWAM Value of an EARL Sample Keyword arguments: EARL -- Valid EARL as Strign Returns CWAM value of the EARL sample Example: >>> earl = '' >>> print getEARLCWAM(earl) 0.0975 """ model = RDF.Model() pars = RDF.RDFXMLParser() pars.parse_string_into_model(model, EARL, base_uri="http://www.eiao.net/rdf/1.0#") stat = RDF.Statement(subject = None, predicate = None, object = RDF.Uri('http://www.w3.org/WAI/ER/EARL/nmg-strawman#fail')) statements = model.find_statements(stat) sample = [removeTail(str(statement.subject.uri)) for statement in statements] return getCWAM(sample) def getCWAM(sample): """Returns CWAM value for one sample Keyword arguments: sample -- One Sample """ #Old deprecated function, only for old UWEM score algorithm calculation. pre = time.time() temp = 1 for result in sample: #Temporary fix to accept unknown IDs temp *= (1-cwamvalues.get(result,0.02)) #f = open('/var/log/eiao/time.log','a') #f.write('Getting CWAM:'+str(time.time()-pre)+'\n') #f.close() return 1-temp # New functions for testing of score functions def getBarrierRatioPerSite(sitepasscount,siteerrorcount): if siteerrorcount + sitepasscount == 0: return 0 return siteerrorcount / float(siteerrorcount + sitepasscount) def getBarrierRatioPerPage(passcount, errorcount): if errorcount + passcount == 0: return 0 return errorcount / float(errorcount + passcount) def get_D_S(siteresultdict, errorcount): """Barrier diversity of web site""" tmp = 0 for result in siteresultdict: tmp = tmp + ((siteresultdict[result][1] / float(errorcount)) * (siteresultdict[result][1] / float(errorcount))) #print tmp return 1 - tmp if __name__ == '__main__': #import doctest #doctest.testmod() #getFCUI() #print cwamvalues from sc import * sc = SystemConfiguration() from urlrep import URLRepUtils3 urlrep = URLRepUtils3(sc) import random files = ['input.earl','input2.earl','input3.earl','input4.earl','input5.earl','input6.earl'] #files = [files[0],files[5]] #for i in range(1000): # urlrep.addSample('www.test.no',numofsamples=1) # if random.uniform(0,1)<0.5: # onefile = random.sample(files,1)[0] #print dynamic(0.05,'www.test.no',[''.join(open(random.sample(files,1)[0],'r').readlines())],urlrep) # print urlrep.getsamplecount(),dynamic(0.05,'www.test.no',[''.join(open(file,'r').readlines()) for file in files],urlrep) # else: # print urlrep.getsamplecount(),dynamic(0.05,'www.test.no',[''.join(open(random.sample(files,1)[0],'r').readlines())],urlrep) #print dynamic(0.05,'www.test.no',[''],urlrep) #print dynamic(0.05,'www.test.no',[''.join(open('input6.earl','r').readlines()[:1000])],urlrep) #import time #pre = time.time() for i in range(100): print dynamic(0.05,'www.test.no',[''.join(open('input5.earl','r').readlines()),''.join(open('input.earl','r').readlines())],urlrep) print dynamic(0.05,'www.test.no',''.join(open('input2.earl','r').readlines()),urlrep) print dynamic(0.05,'www.test.no',''.join(open('input3.earl','r').readlines()),urlrep) print dynamic(0.05,'www.test.no',''.join(open('input4.earl','r').readlines()),urlrep) print dynamic(0.05,'www.test.no',''.join(open('input5.earl','r').readlines()),urlrep) #print 'Total time:',time.time()-pre print 'www.test.no',getAvgCWAM('www.test.no') #all = [dynamic(0.05) for a in range(100)] #print float(sum(all))/len(all) #import pdb #pdb.set_trace() #print getAvgCWAM(range(5))