"""Module for generating RDF"""
# Copyright 2005, 2006 EIAO Consortium
# This program is distributed under the terms of the GNU General
# Public License.
#
# This file is part of the European Internet Accessibility Observatory
# (EIAO)
#
# EIAO is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# EIAO is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with EIAO; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301 USA
__author__ = 'Morten Goodwin Olsen'
__maintainer__ = 'Nils Ulltveit-Moe'
__version__ = 0.1
import time
import string
import md5
import RDF
import urlparse
import datetime
import pdb
import urllib
import cgi
from re import compile,DOTALL
#import re
from RDFgeneratorerror import *
class ExceptionThrower:
    """Placeholder object that refuses every attribute lookup.

    Any attribute that is not found the normal way ends up in
    __getattr__, which raises InstanceNotPresentError naming the
    collaborators that are expected to be installed instead.
    """

    def __getattr__(self, name):
        # The requested attribute name is irrelevant: nothing is available.
        missing = 'URLRepository or SystemConfiguration'
        raise InstanceNotPresentError(missing, None)
# Placeholders for the two collaborators that setInstances() installs; any
# attribute access on an ExceptionThrower raises InstanceNotPresentError.
# NOTE(review): indentation was lost in this copy of the file. Both assignments
# are assumed to belong to the __main__ guard -- confirm against upstream, since
# that would leave urlrep/sc undefined when the module is merely imported.
if __name__ == '__main__':
    urlrep = ExceptionThrower()
    sc = ExceptionThrower()
# Maps a page URL to its page-survey id (written by addpagesurveytourlconnection).
pagedictionary = {}
# Only referenced from commented-out caching code in getearl() in the visible source.
assertionlist = {}
# Patterns used to pull rdf:ID / rdf:about attributes out of serialized EARL text.
regassertion = compile(r'earl:Assertion.rdf:ID=\".+?\"')
regmetadata = compile(r'eiao:MetaData.rdf:ID=\".+?\"')
regtestsubject = compile(r'earl:TestSubject.rdf:about=\".+?\"')
def setInstances(SystemConfiguration, URLRep):
    """Install the collaborators used by the RDF generating functions.

    Must be called before any RDF is generated; the functions in this
    module read the module-level names ``sc`` and ``urlrep``.

    Keyword Arguments:
    SystemConfiguration -- System configuration object, stored as ``sc``
    URLRep -- URL repository object, stored as ``urlrep``

    Returns None.
    """
    global sc, urlrep
    sc, urlrep = SystemConfiguration, URLRep
from RDFgeneratorerror import *
# Text that getearl() puts before/after the EARL when addnamespace is true.
# NOTE(review): rdfbegin is only a newline and rdfend is empty in this copy; the
# markup they presumably carried appears to have been stripped -- confirm upstream.
rdfbegin = """
"""
rdfend = ""
def __isint(value):
    """Tell whether *value* can be converted with int().

    Keyword argument:
    value -- Any object (number, numeric string, None, ...)

    Returns True if int(value) succeeds, False if it fails with
    ValueError, TypeError or OverflowError.

    Example:
    >>> __isint('1')
    True
    >>> __isint(1)
    True
    >>> __isint(None)
    False
    >>> __isint('one')
    False
    >>> __isint(float('inf'))
    False
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinite floats, which int() cannot represent;
        # they are reported as "not an integer" instead of escaping to the caller.
        return False
    return True
def addpagesurveytourlconnection(pagesurveyid,url):
    """Remember which page survey belongs to a URL.

    The pair is stored in the module-level ``pagedictionary``, keyed by
    URL; an existing entry for the same URL is overwritten.

    Keyword arguments:
    pagesurveyid -- ID of the current pagesurvey
    url -- URL to the current page

    Returns None
    """
    pagedictionary.update({url: pagesurveyid})
def gettestrunrdf(testrunnr = None):
    """Generate the RDF for a single test run.

    Keyword arguments:
    testrunnr -- [Optional] The testrun identifier (an integer or a numeric
                 string). The current test run is taken from ``urlrep`` if
                 this is None or otherwise falsy.

    Returns the RDF as a string with surrounding whitespace stripped.

    Raises IllegalTestRunValueError if testrunnr is given but cannot be
    converted to an integer.

    Examples:
    >>> len(gettestrunrdf(1).replace(chr(10),'').replace(chr(32),''))>0
    True
    """
    if testrunnr and not __isint(testrunnr):
        raise IllegalTestRunValueError(testrunnr)
    if not testrunnr:
        testrun = urlrep.getCurrentTestRun()
    else:
        # __isint() also accepts numeric strings such as '12', which the '%d'
        # conversion rejects with TypeError, so normalise to int first.
        testrun = 'http://www.eiao.net/rdf/2.0/TestRun_%d' % int(testrunnr)
    # NOTE(review): the two templates below are reproduced exactly as found. They
    # contain fewer conversions than arguments, so the markup they presumably
    # held looks stripped from this copy -- restore from upstream.
    result = u"""
%s
""" %(testrun, time.strftime('%Y-%m-%d'))
    result += """
""" %(testrun)
    return result.strip()
def getendtestrunrdf(testrunnr = None):
    """Generate the RDF that closes a test run.

    The test run written to the RDF is always the current one reported by
    ``urlrep``; testrunnr is only validated.

    Keyword arguments:
    testrunnr -- [Optional] The testrun identifier. Checked for being an
                 integer when given, otherwise not used.

    Returns the RDF as a string with surrounding whitespace stripped.

    Raises IllegalTestRunValueError if testrunnr is given but is not an integer.

    Examples:
    >>> len(getendtestrunrdf().replace(chr(10),''))>0
    True
    """
    given_but_invalid = bool(testrunnr) and not __isint(testrunnr)
    if given_but_invalid:
        raise IllegalTestRunValueError(testrunnr)
    values = (urlrep.getCurrentTestRun(), time.strftime('%Y-%m-%d'))
    template = """
%s
"""
    return (template % values).strip()
def getsitesurveyrdf(websiteurl, testrunnr = None, sitesurvey = None, downloaddir=None, siteresult=0,variance=0, errormargin=0,urlcount=0,surveynr=None,testrun=None,numbernotdownload=0):
    """Generate the RDF describing one site survey.

    Values that are not supplied (or are falsy) are fetched from the
    module-level ``urlrep`` repository; thresholds are read from ``sc``.

    Keyword arguments:
    websiteurl -- URL to website
    testrunnr -- [Optional] Test run number; only validated here
    sitesurvey -- [Optional] Site survey identifier. Taken from urlrep.getCurrentSiteSurvey() if not given.
    downloaddir -- [Optional] Download location, inserted as given (the code deriving a default is commented out)
    siteresult -- [Optional] Barrier indicator. Taken from urlrep.getCurrentBarrierIndicator() if falsy.
    variance -- [Optional] Taken from urlrep.getCurrentVariance() if falsy.
    errormargin -- [Optional] Taken from urlrep.getCurrentErrorMargin() if falsy.
    urlcount -- [Optional] Taken from urlrep.getCurrentURLCount() if falsy.
    surveynr -- [Optional] Survey number. Taken from urlrep.getCurrentSiteSurvey(retint=True) if not given.
    testrun -- [Optional] Test run identifier. Taken from urlrep.getCurrentTestRun() if not given.
    numbernotdownload -- [Optional] Value passed to the template as a string (presumably the number of pages not downloaded -- confirm)

    Returns RDF as a string with surrounding whitespace stripped.

    Raises IllegalSiteSurveyValueError if the survey number is not an integer.

    NOTE(review): indentation was reconstructed for this block; the original
    nesting could not be read from this copy.

    Example:
    >>> from sc import *
    >>> from housekeeping import *
    >>> setInstances(SystemConfiguration(), HouseKeeping())
    >>> urlrep.setCurrentPageSurvey('www.holekommune.no_pageSurvey_1')
    >>> urlrep.setCurrentSiteSurvey('www.holekommune.no_survey_42')
    >>> len(getsitesurveyrdf('http://www.holekommune.no',testrunnr=12).replace(chr(10),''))>0
    True
    """
    # NOTE(review): 'IllegealTestRunValueError' looks like a misspelling of
    # IllegalTestRunValueError (the name raised by gettestrunrdf); as written this
    # branch would fail with NameError unless that name exists in RDFgeneratorerror.
    if testrunnr and not __isint(testrunnr):
        raise IllegealTestRunValueError(testrunnr)
    if not sitesurvey:
        sitesurvey = urlrep.getCurrentSiteSurvey()
    # The identifier is split on '_' and its first part is kept as the site name.
    surveylist = sitesurvey.split('_')
    if not surveynr:
        surveynr = urlrep.getCurrentSiteSurvey(retint=True)
    if not __isint(surveynr):
        raise IllegalSiteSurveyValueError(sitesurvey)
    # sitename is only referenced by the commented-out downloaddir code below.
    sitename = surveylist[0]
    #if not downloaddir:
    # downloaddir = '/'.join([sc.webcachedirectory.rstrip('/'), sc.rdfmodel, str(surveynr), sitename])
    # #The downloaddir will be in the following structure /blablabla/model/testrun/site/page
    if not testrun:
        testrun = urlrep.getCurrentTestRun()
    if not siteresult:
        barrierindicator = urlrep.getCurrentBarrierIndicator()
    else:
        barrierindicator = siteresult
    if not variance:
        variance = urlrep.getCurrentVariance()
    if not errormargin:
        errormargin = urlrep.getCurrentErrorMargin()
    if not urlcount:
        urlcount = urlrep.getCurrentURLCount()
    #Unreliablewarning
    # Flagged when too few URLs were counted or too many were unavailable.
    if (int(urlcount) <= int(sc.minurlcount)) or (urlrep.numunavailable >= int(sc.maxunavailable)):
        unreliable = True
    else:
        unreliable = False
    #notApplicable
    # NOTE(review): the condition below is not valid Python ('=' inside an
    # expression). Part of the comparison appears to have been lost from this
    # copy; it is left untouched and must be restored from upstream.
    if urlrep.getsamplecount()= int(sc.maxtimeoutcount):
        notapplicable = True
    else:
        notapplicable = False
    # NOTE(review): this template and the two below carry fewer conversions than
    # arguments, so the markup they presumably held looks stripped from this copy.
    result = """
%s
%s
%s
%s
%s
%s
%s
""" % (testrun, sitesurvey, sitesurvey, downloaddir,websiteurl, str(barrierindicator),str(variance),str(errormargin),str(urlcount),str(numbernotdownload))
    if unreliable:
        # The flag variable is reused to hold the text appended to the result.
        unreliable = """
1
""" % (sitesurvey)
        result += unreliable
    if notapplicable:
        notapplicable = """
1
""" % (sitesurvey)
        result += notapplicable
    return result.strip()
def getscenariordf(scenarios, scenarioid,pagesurveyid,barrierindicator,rangelocationid=None,keyusescenario=True,currentsitesurvey=None):
    """Generate the RDF for one scenario and its range locations.

    Keyword arguments:
    scenarios -- List of scenarios as tuples. Each tuple is read by index as
                 (fromurl, tolinenumber, tocolumnnumber, tourl,
                 fromlinenumber, fromcolumnnumber).
    scenarioid -- Scenario identifier; concatenated to a string, so it must be a string
    pagesurveyid -- Page survey identifier; concatenated to a string, so it must be a string
    barrierindicator -- Value written (via str()) for every range location
    rangelocationid -- [Optional] Not read: it is overwritten for every page
                       with urlrep.getNextRangeLocation(retint=True).
    keyusescenario -- [Optional] If the scenario should be a keyusescenario or a pagescenario. Keyusescenario as default.
    currentsitesurvey -- [Optional] Site survey identifier. Taken from urlrep.getCurrentSiteSurvey() if not given.

    Returns scenario RDF as a string with surrounding whitespace stripped.

    Raises WrongLengthOfLogError if a scenario tuple has fewer than six elements.

    Side effect: calls urlrep.setErrorDetected(True) when the pages of the
    scenario do not all share the same network location.

    NOTE(review): indentation was reconstructed for this block, and most string
    templates contain fewer conversions than arguments (or none at all), so the
    markup they presumably held looks stripped from this copy.

    Examples (NOTE(review): the call below passes fewer positional arguments
    than the signature requires):
    >>> urlrep.setCurrentScenario('www.hole.kommune.no_survey_12_scenario_1')
    >>> urlrep.setCurrentSiteSurvey('www.hole.kommune.no_survey_12')
    >>> pagedictionary['http://www.hia.no/'] = 'http://www.eiao.net_test_survey_12'
    >>> pagedictionary['http://www.hia.no/english'] = 'http://www.eiao.net_test_survey_12'
    >>> pagedictionary['http://www.hia.no/sonja'] = 'http://www.eiao.net_test_survey_12'
    >>> result = getscenariordf([('http://www.hia.no',1,12,'http://www.hia.no/english',0,0),('http://www.hia.no/english',34,12,'http://www.hia.no/sonja',0,0)],1)
    >>> result = result.replace(chr(10),'')
    >>> len(result)>0
    True
    """
    #if keyusescenario:
    # #Write walk log to a file
    # f = open(sc.loglocation+'crawler_debug.log','a')
    # f.write(str(pagedictionary.keys()))
    # f.write('\n')
    # f.write(str([a[:6] for a in scenarios]))
    # f.write('\n')
    # f.close()
    # currentpagesurvey is not referenced again in the visible code.
    currentpagesurvey = 'http://www.eiao.net/rdf/2.0#PageSurvey_' + pagesurveyid
    currentscenario = 'http://www.eiao.net/rdf/2.0#Scenario_' + scenarioid
    if not currentsitesurvey:
        currentsitesurvey = urlrep.getCurrentSiteSurvey()
    result = """
""" % (currentsitesurvey, currentscenario)
    result += '' %(currentscenario)
    if keyusescenario:
        result += 'keyusescenario'
    else:
        result += 'pagescenario'
    result += ''
    #next range location
    # Collect (url, 'end'|'start', line, column) points: the page a step leaves
    # ends at the "to" position, the page it enters starts at the "from" position.
    pages = []
    for s in scenarios:
        try:
            fromurl = s[0]
            tolinenumber = s[1]
            tocolumnnumber = s[2]
            tourl = s[3]
            fromlinenumber = s[4]
            fromcolumnnumber = s[5]
            if fromurl:
                #Fromurl will be absent when the scenario starts at a seed-url
                urltuple = urlparse.urlsplit(fromurl)
                # An empty path is replaced by '/'.
                if not urltuple[2]:
                    urltuple = (urltuple[0],urltuple[1],'/',urltuple[3],urltuple[4])
                    fromurl = urlparse.urlunsplit(urltuple)
                pages.append((fromurl,'end',tolinenumber,tocolumnnumber))
            if tourl:
                urltuple = urlparse.urlsplit(tourl)
                if not urltuple[2]:
                    urltuple = (urltuple[0],urltuple[1],'/',urltuple[3],urltuple[4])
                    tourl = urlparse.urlunsplit(urltuple)
                pages.append((tourl,'start',fromlinenumber,fromcolumnnumber))
        except IndexError:
            raise WrongLengthOfLogError(s)
    # 'outside' is not referenced again in the visible code.
    outside = False
    l = set()
    l = set([a[0] for a in pages])
    # Report an error when the collected URLs span more than one network location.
    stem = None
    for iurl in l:
        if iurl:
            if not stem:
                stem = urlparse.urlsplit(iurl)[1]
            else:
                if not stem == urlparse.urlsplit(iurl)[1]:
                    urlrep.setErrorDetected(True)
                    #f = open(sc.loglocation+'crawler_debug.log','a')
                    #f.write('Error detected, too wide random walk.\n')
                    #f.write(str(stem))
                    #f.write('\n')
                    #f.write(str([b[:6] for b in scenarios]))
                    #f.write('\n')
                    #f.write(str(l))
                    #f.write('\n')
                    #f.close()
    # One range location is emitted per distinct, non-empty page URL.
    for page in set([point[0] for point in pages if point[0]]):
        try:
            (fromlinenumber, fromcolumnnumber) = [(point[2],point[3]) for point in pages if point[1]=='start' and point[0]==page][0]
        except IndexError:
            #If line- and column-number does not exist. This is a seed URL starting from line 0 column 0
            (fromlinenumber,fromcolumnnumber) = (0,0)
        totheend=False
        try:
            (tolinenumber, tocolumnnumber) = [(point[2],point[3]) for point in pages if point[1]=='end' and point[0]==page][0]
        except:
            #If line- and column-number does not exist, it means that the scenario ends at the end of the page
            totheend=True
        rangelocationid = urlrep.getNextRangeLocation(retint=True)
        result += '' %(currentscenario)
        result += ' ' % (currentscenario, rangelocationid)
        result += ' %s' % (str(barrierindicator))
        result += ''
        result += '' %(currentscenario, rangelocationid)
        urltuple = urlparse.urlsplit(page)
        if not urltuple[2]:
            urltuple = (urltuple[0],urltuple[1],'/',urltuple[3],urltuple[4])
            page = urlparse.urlunsplit(urltuple)
        result += ' ' % pagesurveyid
        result += ' '
        result += ' ' %(currentscenario, rangelocationid)
        if not totheend:
            result += ' ' %(currentscenario, rangelocationid)
        # The first five characters are kept as-is and the rest is URL-quoted
        # (presumably to leave the 'http:' prefix untouched -- confirm).
        page = page[:5]+urllib.quote(page[5:])
        result += ' %s' %(page)
        result += ''
        #Start pointer values
        result += '' %(currentscenario, rangelocationid)
        result += ' '
        result += ' %d' %(fromlinenumber)
        result += ' %d' %(fromcolumnnumber)
        result += ''
        if not totheend:
            #End pointer values
            result += '' %(currentscenario, rangelocationid)
            result += ' '
            result += ' %d' %(tolinenumber)
            result += ' %d' %(tocolumnnumber)
            result += ''
    return result.strip()
def getpagesurveyrdf(doctype, authoringtool, header,webcacheurl,language=None,fullurl=None,currentpagesurvey=None):
    """Generate the RDF for a page survey and link it to the current site survey.

    Keyword arguments:
    doctype -- Document type declaration; quotes are percent-encoded, then it is escaped with cgi.escape
    authoringtool -- Authoring tool used (escaped with cgi.escape)
    header -- HTTP header as a dictionary; must not be empty
    webcacheurl -- URL to the corresponding web cache (URL-quoted after its first five characters)
    language -- [Optional] Language of the site, if present
    fullurl -- [Optional] Full url to the web page, if present
    currentpagesurvey -- [Optional] Page survey identifier. Taken from urlrep.getCurrentPageSurvey() if not given.

    Returns a tuple (RDF string, page survey URI), where the URI is
    'http://www.eiao.net/rdf/2.0#PageSurvey_' followed by the page survey identifier.

    Raises EmptyHeaderInformationError if header is empty.

    NOTE(review): indentation was reconstructed for this block, and most string
    templates contain fewer conversions than arguments (or none at all), so the
    markup they presumably held looks stripped from this copy. The doctest call
    that followed the setup lines below was garbled and is omitted.

    Examples:
    >>> from sc import *
    >>> from housekeeping import *
    >>> setInstances(SystemConfiguration(), HouseKeeping())
    >>> urlrep.setCurrentPageSurvey('www.holekommune.no_pageSurvey_1')
    """
    if not header:
        raise(EmptyHeaderInformationError())
    currentsitesurvey = urlrep.getCurrentSiteSurvey()
    if not currentpagesurvey:
        currentpagesurvey = urlrep.getCurrentPageSurvey()
    #print 'Current page survey:',currentpagesurvey
    # The identifier is split on '_': first part is the name, last part the id.
    pagesurveylist = currentpagesurvey.split('_')
    pagesurveyid = pagesurveylist[-1]
    pagesurveyname = pagesurveylist[0]
    # surveynr is only referenced by the commented-out webcacheurl line below.
    surveynr = urlrep.getCurrentSiteSurvey(retint=True)
    # The first five characters are kept as-is and the rest is URL-quoted
    # (presumably to leave the 'http:' prefix untouched -- confirm).
    webcacheurl = webcacheurl[:5]+urllib.quote(webcacheurl[5:])
    doctype = doctype.replace('\'','%27').replace('"','%22')
    #webcacheurl = '/'.join([sc.webcacheurl, sc.crawlerrdfmodel, surveynr, sitename])
    result = '' %(currentpagesurvey)
    result += ' %s' % (datetime.datetime.today().isoformat())
    result += ' %s' % (cgi.escape(authoringtool))
    if fullurl:
        fullurl = fullurl[:5]+urllib.quote(fullurl[5:])
        result +=' %s'%(str(fullurl))
    if language:
        result += ' %s' % (cgi.escape(language))
    result += ' %s' % (cgi.escape(doctype))
    result += ' ' % (webcacheurl)
    result += ' ' % (pagesurveyname, pagesurveyid)
    result += ''
    result += '' % (pagesurveyname, pagesurveyid)
    # One entry per HTTP header field; spaces are removed from the field name.
    for key,value in header.items():
        key = key.replace(' ','')
        #Escaping illigal &-characters. Making them possible to parse in the XML.
        #value = re.sub(r'\&[^amp;]','&',value)
        value = cgi.escape(value)
        try:
            result += '%s' % (key,value,key)
        except UnicodeDecodeError:
            # Retry with key and value decoded as ISO-8859-1.
            result += '%s' % (unicode(key,'iso-8859-1'),unicode(value,'iso-8859-1'),unicode(key,'iso-8859-1'))
    result += ''
    #print 'Connecting pageSurveys to siteSurveys'
    result += """
""" % (currentsitesurvey, currentpagesurvey)
    return (result,'http://www.eiao.net/rdf/2.0#PageSurvey_%s' %currentpagesurvey)
def getearlmetadata(EARL):
    """Internal function for collecting the meta data identifiers in EARL text.

    Every match of the module-level ``regmetadata`` pattern contributes the
    text between its first and last double quote, prefixed with the EIAO
    namespace.

    Keyword arguments:
    EARL -- EARL as serialized text (it is searched with a regular expression)

    Returns the metadata URIs as a list of strings, in order of appearance.
    """
    namespace = 'http://www.eiao.net/rdf/2.0/#'
    uris = []
    for hit in regmetadata.findall(EARL):
        opening = hit.find('"')
        closing = hit.rfind('"')
        uris.append(namespace + hit[opening + 1:closing])
    return uris
def getearltestsubject(EARL):
    """Internal function for collecting the test subjects in EARL text.

    Every match of the module-level ``regtestsubject`` pattern contributes
    the text between its first and last double quote, prefixed with the EIAO
    namespace.

    Keyword arguments:
    EARL -- EARL as serialized text (it is searched with a regular expression)

    Returns the test subject URIs as a list of strings (empty if none found).
    """
    namespace = 'http://www.eiao.net/rdf/2.0/#'
    subjects = []
    for hit in regtestsubject.findall(EARL):
        opening = hit.find('"')
        closing = hit.rfind('"')
        subjects.append(namespace + hit[opening + 1:closing])
    return subjects
def getearl(URL, EARL, pagesurvey=None, preparseEARL = False, assertions = [],addnamespace=True,contentlength=None,calculatedcontentlength=0,type='text/html',uncheckablecounter=None,unavailablecounter=None):
    """Generate the RDF connecting EARL to a pagesurvey.

    Keyword arguments:
    URL -- URL of the page
    EARL -- EARL as xml text
    pagesurvey -- [Optional] pagesurvey to connect the EARL. Getting current pagesurvey from urlrep if None.
    preparseEARL -- [Optional] If the assertions should be extracted from the EARL text (performance reducing). False as default.
    assertions -- [Optional] List of assertions. Either this or preparseEARL must be provided.
    addnamespace -- [Optional] If rdfbegin/rdfend should be wrapped around the EARL before it is searched. True as default.
    contentlength -- [Optional] HTTP header content length; 0 is written if falsy
    calculatedcontentlength -- Calculated Content length
    type -- [Optional] Type of content to get EARL from. 'text/css' is allowed to have no assertions.
    uncheckablecounter -- [Optional] Taken from urlrep.getUncheckabilitycount(URL) if falsy
    unavailablecounter -- [Optional] Taken from urlrep.getUnavailablecount(URL) if falsy

    Returns the RDF as a string.

    Raises IllegalCombinationError if neither preparseEARL nor assertions is given,
    EmptyEARLError if the EARL is empty or whitespace only, and
    NoAssertionsPresentEARLError if no assertions are available and type is not 'text/css'.

    NOTE(review): indentation was reconstructed for this block, and most string
    templates contain fewer conversions than arguments (or none at all), so the
    markup they presumably held looks stripped from this copy.
    NOTE(review): the mutable default for assertions is only rebound, never
    mutated, in the visible code.
    """
    if not preparseEARL and not assertions:
        raise IllegalCombinationError('Must ether provide a list of assertions or the possibility to preparse the EARL. preparsedEARL must be True or Assertions must be filled with a list of strings')
    #if md5.new(EARL).hexdigest() in assertionlist.keys():
    # assertions,testsubject,metadata = assertionlist[md5.new(EARL).hexdigest()]
    # preparseEARL = False
    if addnamespace:
        completeEARL = rdfbegin + EARL + rdfend
    else:
        completeEARL = EARL
    # NOTE(review): with this nesting metadata and testsubject are only bound when
    # preparseEARL is true, yet both are used unconditionally further down.
    # Confirm the original nesting against upstream.
    if preparseEARL:
        assertions = ['http://www.eiao.net/rdf/2.0/#'+s[s.find('"')+1:s.rfind('"')] for s in regassertion.findall(completeEARL)]
        metadata = getearlmetadata(completeEARL)
        try:
            testsubject = getearltestsubject(completeEARL)
        except IndexError:
            #If IndexError is raised, the EARL does not contain correct testsubject
            testsubject = None
            raise EmptyEARLError(EARL,'')
        #assertionlist[md5.new(EARL).hexdigest()] = (assertions,testsubject,metadata)
    if not EARL:
        raise EmptyEARLError(EARL,'')
    # Reject EARL that consists of whitespace only.
    emptyEARL = EARL
    for i in string.whitespace:
        emptyEARL = emptyEARL.replace(i,'')
    if not emptyEARL:
        raise EmptyEARLError(EARL,'')
    else:
        del emptyEARL
    if not pagesurvey:
        pagesurvey = urlrep.getCurrentPageSurvey()
    if (not assertions) and (not type=='text/css'):
        raise NoAssertionsPresentEARLError(EARL,'')
    #TODO: Make sure these values are correct. Should be done with testing.
    if not unavailablecounter:
        unavailablecounter = urlrep.getUnavailablecount(URL)
    if not uncheckablecounter:
        uncheckablecounter = urlrep.getUncheckabilitycount(URL)
    #Add earl:TestSubject Data to this part of the graph
    earltestsubject = ''%(testsubject)
    if uncheckablecounter:
        earltestsubject += ' %d'%(int(uncheckablecounter))
    else:
        earltestsubject += ' 0'
    if unavailablecounter:
        earltestsubject += ' %d'%(int(unavailablecounter))
    else:
        earltestsubject += ' 0'
    if contentlength:
        earltestsubject += ' %d'%(int(contentlength))
    else:
        earltestsubject += ' 0'
    earltestsubject += ' %d'%(int(calculatedcontentlength))
    earltestsubject += ''
    result = EARL
    # NOTE(review): replacing '' with '' leaves the text unchanged; the original
    # search/replacement strings look stripped from this copy.
    result = result.replace('','')
    result += '' % (pagesurvey)
    #Connecting the testsubjects to the pagesurvey. (This is needed for it to be connected to the RDF graph and retrievable for the ETL).
    result += '' %(testsubject)
    #Connecting the assertions to the pagesurvey (This is needed for it to be connected to the RDF graph and retrievable for the ETL).
    for assertion in assertions:
        result += '' %(assertion)
    for mdata in metadata:
        result += '' %(mdata)
    result += ''
    result += earltestsubject
    if not addnamespace:
        result += ''
    return result