"""Module for performing adaptive sampling"""
# Copyright 2005, 2006 EIAO Consortium
# This program is distributed under the terms of the GNU General
# Public License.
#
# This file is part of the European Internet Accessibility Observatory
# (EIAO)
#
# EIAO is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# EIAO is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with EIAO; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301 USA
__author__ = "Morten Goodwin Olsen"
__maintainer__ = 'Morten Goodwin Olsen'
__version__ = "$Id$"
from math import *
from pygsl.statistics import *
import RDF
import time
import pdb
if not __name__ == 'AdaptiveSampling.adaptivesampling':
#Hack for making documentation generation possible
from re import compile,DOTALL
import sc
from adaptivesamplingerror import *
# Module-level state shared by the functions in this file.
# samplenum is reset to 0 on every getAvgCWAM call.
samplenum = 0
# Maps a site to the list of per-sample scores stored by dynamic().
sitedictionary = {}
# Not written by any function visible in this file.
siteresultdict = {}
# Running totals of failed / passed test results, increased by getEARLCWAM2.
siteerrorcount = 0
sitepasscount = 0
# Regular expressions compiled once at import time and used by getEARLCWAM2.
# They are skipped when the module is loaded as
# 'AdaptiveSampling.adaptivesampling', in which case "compile" is not imported.
if __name__ != 'AdaptiveSampling.adaptivesampling':
    # From 'earl:result rdf:ID' up to 'strawman#' plus the next four
    # characters; DOTALL lets the match span several lines.
    earlresult = compile(r'earl:result rdf:ID.+?strawman\#....',DOTALL)
    # A complete rdf:ID="..." attribute.
    rdfid = compile('rdf:ID=\".+?\"')
def isFloat(value):
    """Report whether value can be converted to a float.

    Keyword arguments:
    value -- object to probe, for example a column read from a text file

    returns True if float(value) succeeds, otherwise False
    """
    try:
        float(value)
    except (ValueError, TypeError):
        # ValueError: a string that is not a number.
        # TypeError: an object float() does not accept at all, such as None.
        return False
    return True
def translateDict(value):
    """Expand the short WAM ID prefixes into their URL form for quick lookups.

    Keyword arguments:
    value -- WAM ID with an 'EIAO:' or 'Imergo:' prefix

    returns the WAM ID as URL; text without a known prefix is returned unchanged

    Examples:
    >>> translateDict('EIAO:B10.11.01.001.001-001')
    'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.11.01.001.001-001'
    >>> translateDict('Imergo:B10.10.01.001.001-001.001')
    'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.B10.10.01.001.001-001.001'
    """
    # Applied in this order: EIAO first, then Imergo.
    replacements = (
        ('EIAO:', 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.'),
        ('Imergo:', 'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.'),
    )
    translated = value
    for prefix, url in replacements:
        translated = translated.replace(prefix, url)
    return translated
def removeTail(value):
    """Remove the tail of a string (everything from the last '.') to get the actual ID.

    Keyword arguments:
    value -- WAM ID with tail

    returns WAM ID without tail; a value that contains no '.' has no tail and
    is returned unchanged

    Example:
    >>> removeTail('http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001.001-V402382')
    'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001'
    """
    cut = value.rfind('.')
    if cut == -1:
        # Without this guard value[:-1] would silently drop the last character.
        return value
    return value[:cut]
# Load the barrier weights once at import time. This is skipped when the module
# is loaded as 'AdaptiveSampling.adaptivesampling' (the documentation
# generation hack used for the imports at the top of the file).
if __name__ != 'AdaptiveSampling.adaptivesampling':
    # NOTE: this rebinds the name "sc" from the imported module to a
    # SystemConfiguration instance; dynamic() reads sc.minsamplecount from it.
    sc = sc.SystemConfiguration()
    # Maps a WAM ID in URL form (see translateDict) to its weight as a float.
    cwamvalues = {}
    barrierfile = open(sc.barriercomputationfile)
    try:
        for line in barrierfile:
            fields = line.split('\t')
            # Ignore lines that have no second tab-separated column or whose
            # second column is not a number.
            if len(fields) > 1 and isFloat(fields[1]):
                cwamvalues[translateDict(fields[0])] = float(fields[1])
    finally:
        # Close the file even if a line cannot be processed.
        barrierfile.close()
def dynamic(reacherrmarg,site,earlsamples,urlrep,ci,minscenario,writeresult=True):
"""Perform dynamic sampling: score one sample and report whether the target is reached
Keyword arguments:
reacherrmarg -- Error margin to reach
site -- The site this sample is related to
earlsamples -- Sequence of strings that are joined into the complete EARL for this one sample
urlrep -- Object whose getsamplecount() gives the current sample count
ci -- Multiplier passed on to getAvgCWAM for computing the error margin
minscenario -- The error margin check only succeeds when the sample count is above this value
writeresult -- If the aggregated results should be counted as being part of the site.
returns a tuple of a boolean followed by the average, standard deviation and
error margin obtained from getAvgCWAM
"""
# cwamvalues is declared here but not used in this function
global cwamvalues
global sitedictionary
# First sample for this site: start with an empty score list
if not sitedictionary.has_key(site):
sitedictionary[site] = []
earlsample = ''.join(earlsamples)
if writeresult:
# Store the score of this sample for the site, then aggregate
sitedictionary[site] = sitedictionary[site]+[getEARLCWAM2(earlsample)]
# errmargin is overwritten by the getAvgCWAM call below; num is never read
errmargin = 1
num = 0
avg,stddev,errmargin = getAvgCWAM(site,ci)
else:
errmargin = 1
num = 0
# Aggregate with the score of this sample included, without storing it in sitedictionary
# NOTE: getEARLCWAM2 with its default algorithm still adds the pass/fail counts
# of this sample to the module-level siteerrorcount and sitepasscount
avg,stddev,errmargin = getAvgCWAM(site,ci,sitedictionary[site]+[getEARLCWAM2(earlsample)],urlrep.getsamplecount())
# Decision based only on a fixed sample count taken from sc.minsamplecount
# NOTE(review): confirm whether this check is meant to apply only to the
# writeresult=False path; if it runs unconditionally, the error margin
# check below is unreachable
if urlrep.getsamplecount()>=int(sc.minsamplecount):
return True,avg,stddev,errmargin
else:
return False,avg,stddev,errmargin
#End of FIXME
# False while the error margin is above reacherrmarg or the sample count is at or below minscenario
if errmargin>reacherrmarg or urlrep.getsamplecount()<=minscenario:
return False,avg,stddev,errmargin
else:
return True,avg,stddev,errmargin
def getAvgCWAM(site,ci,cwams=None,samplecount=0):
    """Return the aggregated score, standard deviation and error margin for a site.

    The average is the barrier ratio computed from the module-level
    sitepasscount and siteerrorcount. The standard deviation is computed with
    sd_m over the same counters, treating every failed test as 1 and every
    passed test as 0.

    Keyword arguments:
    site -- key into sitedictionary, only looked up when cwams is empty
    ci -- multiplier applied to the standard error to get the error margin
          (earlier revisions hard-coded 1.96 and 2.326 here)
    cwams -- optional list of per-sample scores; only whether it is empty is
             inspected
    samplecount -- accepted for compatibility with existing callers, not used

    returns average, standard deviation and error margin

    raises NoSiteInformationError when cwams is empty and site is not in
    sitedictionary
    """
    #Note: This is called directly!
    #Used for getting the final site result in the end
    global samplenum
    samplenum = 0
    if not cwams:
        try:
            stored = sitedictionary[site]
        except KeyError:
            raise NoSiteInformationError(site)
        # Falsy scores (0, None) are dropped; if nothing is left the error
        # margin below stays 0.
        cwams = [sample for sample in stored if sample]
    testcount = sitepasscount + siteerrorcount
    average = getBarrierRatioPerSite(sitepasscount, siteerrorcount)
    # One entry per test result: 1 for a failure, 0 for a pass.
    outcomes = [1] * siteerrorcount + [0] * sitepasscount
    stddev = sd_m(outcomes, average)
    if cwams and testcount > 0:
        errmarg = ci * stddev / sqrt(testcount)
    else:
        errmarg = 0
    return average, stddev, errmarg
def getEARLCWAM2(EARL, scorealgorithm="brps"):
    """Get the score of an EARL sample without using Redland.

    Parsing with simple string matching instead of Redland changed the run
    time from 106.6s to 1.39s, almost a 100 times performance increase, so
    this function is preferable to getEARLCWAM.

    Keyword arguments:
    EARL -- EARL as string
    scorealgorithm -- "uwem" selects the old UWEM calculation through getCWAM;
                      every other value ("brpp", "brps", "brdpp_s", "brdps_s",
                      "brdpp_p", "brdps_p" are the documented names) gives the
                      barrier ratio of this page

    returns the score of the EARL sample

    Side effect: unless scorealgorithm is "uwem", the number of fail and pass
    results found in EARL is added to the module-level siteerrorcount and
    sitepasscount.

    Example (an empty report has no results, so the barrier ratio is 0):
    >>> earl = ''
    >>> print(getEARLCWAM2(earl))
    0
    """
    global siteerrorcount
    global sitepasscount
    if scorealgorithm == "uwem":
        failed = []
        for result in earlresult.findall(EARL):
            if result.endswith('fail'):
                # [8:-1] strips the leading 'rdf:ID="' and the closing quote.
                wamid = rdfid.findall(result)[0][8:-1]
                failed.append('http://www.eiao.net/rdf/1.0/#' + removeTail(wamid))
        # Every failed ID is counted only once.
        return getCWAM(list(set(failed)))
    errorcount = EARL.count('nmg-strawman#fail')
    passcount = EARL.count('nmg-strawman#pass')
    #Save the samples for overall site calculation and return page result
    siteerrorcount = siteerrorcount + errorcount
    sitepasscount = sitepasscount + passcount
    return getBarrierRatioPerPage(passcount, errorcount)
def getEARLCWAM(EARL):
    """Get the CWAM value of an EARL sample by parsing it with Redland.

    Keyword arguments:
    EARL -- valid EARL (RDF/XML) as string

    returns the getCWAM value for the subjects of all statements whose object
    is the nmg-strawman#fail URI, each with its tail removed
    """
    model = RDF.Model()
    parser = RDF.RDFXMLParser()
    parser.parse_string_into_model(model, EARL, base_uri="http://www.eiao.net/rdf/1.0#")
    # Match any subject and predicate, as long as the object is the fail URI.
    failuri = RDF.Uri('http://www.w3.org/WAI/ER/EARL/nmg-strawman#fail')
    pattern = RDF.Statement(subject = None, predicate = None, object = failuri)
    failed = []
    for match in model.find_statements(pattern):
        failed.append(removeTail(str(match.subject.uri)))
    return getCWAM(failed)
def getCWAM(sample):
    """Return the CWAM value for one sample.

    Old deprecated function, only for the old UWEM score algorithm calculation.

    Keyword arguments:
    sample -- one sample: an iterable of WAM IDs, looked up as keys in cwamvalues

    returns 1 minus the product of (1 - weight) over all IDs in the sample
    """
    remaining = 1
    for wamid in sample:
        #Temporary fix to accept unknown IDs: they get a weight of 0.02
        remaining *= 1 - cwamvalues.get(wamid, 0.02)
    return 1 - remaining
# New functions for testing of score functions
def getBarrierRatioPerSite(sitepasscount,siteerrorcount):
    """Return the share of failed results among all results of a site.

    Keyword arguments:
    sitepasscount -- number of passed results
    siteerrorcount -- number of failed results

    returns siteerrorcount divided by the total as a float, or 0 when there
    are no results at all
    """
    total = siteerrorcount + sitepasscount
    if total == 0:
        return 0
    return siteerrorcount / float(total)
def getBarrierRatioPerPage(passcount, errorcount):
    """Return the share of failed results among all results of a page.

    Keyword arguments:
    passcount -- number of passed results
    errorcount -- number of failed results

    returns errorcount divided by the total as a float, or 0 when there are
    no results at all
    """
    total = errorcount + passcount
    if total == 0:
        return 0
    return errorcount / float(total)
def get_D_S(siteresultdict, errorcount):
    """Barrier diversity of web site.

    Keyword arguments:
    siteresultdict -- dictionary whose values are sequences; element 1 of each
                      value is used as a count
    errorcount -- number each count is divided by

    returns 1 minus the sum of the squared count/errorcount shares
    """
    squares = 0
    for key in siteresultdict:
        share = siteresultdict[key][1] / float(errorcount)
        squares = squares + (share * share)
    return 1 - squares
if __name__ == '__main__':
# Manual test driver: runs local EARL files through dynamic() for the site
# 'www.test.no' and prints the returned tuples, then prints getAvgCWAM().
# NOTE(review): dynamic() as defined in this file also requires the ci and
# minscenario arguments, and getAvgCWAM() requires ci, but the calls below
# do not pass them; these calls look stale, confirm before using this driver.
#import doctest
#doctest.testmod()
#getFCUI()
#print cwamvalues
from sc import *
sc = SystemConfiguration()
from urlrep import URLRepUtils3
urlrep = URLRepUtils3(sc)
import random
files = ['input.earl','input2.earl','input3.earl','input4.earl','input5.earl','input6.earl']
#files = [files[0],files[5]]
#for i in range(1000):
# urlrep.addSample('www.test.no',numofsamples=1)
# if random.uniform(0,1)<0.5:
# onefile = random.sample(files,1)[0]
#print dynamic(0.05,'www.test.no',[''.join(open(random.sample(files,1)[0],'r').readlines())],urlrep)
# print urlrep.getsamplecount(),dynamic(0.05,'www.test.no',[''.join(open(file,'r').readlines()) for file in files],urlrep)
# else:
# print urlrep.getsamplecount(),dynamic(0.05,'www.test.no',[''.join(open(random.sample(files,1)[0],'r').readlines())],urlrep)
#print dynamic(0.05,'www.test.no',[''],urlrep)
#print dynamic(0.05,'www.test.no',[''.join(open('input6.earl','r').readlines()[:1000])],urlrep)
#import time
#pre = time.time()
for i in range(100):
print dynamic(0.05,'www.test.no',[''.join(open('input5.earl','r').readlines()),''.join(open('input.earl','r').readlines())],urlrep)
print dynamic(0.05,'www.test.no',''.join(open('input2.earl','r').readlines()),urlrep)
print dynamic(0.05,'www.test.no',''.join(open('input3.earl','r').readlines()),urlrep)
print dynamic(0.05,'www.test.no',''.join(open('input4.earl','r').readlines()),urlrep)
print dynamic(0.05,'www.test.no',''.join(open('input5.earl','r').readlines()),urlrep)
#print 'Total time:',time.time()-pre
print 'www.test.no',getAvgCWAM('www.test.no')
#all = [dynamic(0.05) for a in range(100)]
#print float(sum(all))/len(all)
#import pdb
#pdb.set_trace()
#print getAvgCWAM(range(5))