"""Module for reading RDF-blobs""" # Copyright 2005, 2006 EIAO Consoritum # This program is distributed under the terms of the GNU General # Public License. # # This file is part of the European Internet Accessibility Observatory # (EIAO) # # EIAO is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # EIAO is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with EIAO; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301 USA import MySQLdb import RDF import random import time import sys from re import compile,DOTALL import gc import memcache import md5 __author__ = 'Morten Goodwin Olsen' __maintainer__ = 'Nils Ulltveit-Moe' __version__ = 0.1 eiao = 'http://www.eiao.net/rdf/2.0#' earl = 'http://www.w3.org/WAI/ER/EARL/nmg-strawman#' #Precompiled regular expressions regpagesurvey = compile(r'eiao:pageSurvey.rdf:about=\".+?\"') regtestsubject = compile(r'earl:testsubject.rdf:resource=\".+?\"') regcalculatedsize = compile(r'\.*\<\/eiao:calculatedSize\>') regmetadata = compile(r'eiao:MetaData rdf:ID=\"(.+?)\"') regassertdata = compile(r'earl:Assertion rdf:ID=\"(.+?)\"') regassertblocks = compile(r'',DOTALL) regmetadatablocks = compile(r'',DOTALL) #regsinglelocationblocks = compile('',DOTALL) regtype = compile(r'eiao:type rdf:resource=\"(.+?)\"') regsinglelocation = compile(r'eiao:singleLocation rdf:resource=\"#(.+?)\"') regline = compile(r'(.*?)') regcolumn = compile(r'(.*?)') regvalue = compile(r'(.*?)') regrequirement = compile(r'') regresult = compile(r'') regsubject = compile(r'') 
# Block-level patterns (DOTALL so that '.' also spans newlines).
# NOTE(review): these patterns look damaged - the literal tag text between the
# character classes and groups appears to have been stripped. In particular
# addCacheData unpacks THREE values per match from regvalidity (one group
# here) and from regsinglelocationblock (two groups here). The literals are
# reproduced unchanged; restore the real patterns from version control.
regvalidity = compile(r'[\w\d\s]*[\w\d\s]*(.*?)',DOTALL)
regsinglelocationblock = compile(r'[\w\d\s]*(.*?)[\w\d\s]*(.*?)[\w\d\s]*',DOTALL)
regtidyandparsing = compile(r'[\w\d\s]*(.*?)[\w\d\s]*(.*?)[\w\d\s]*(.*?)[\w\d\s]*(.*?)[\w\d\s]*',DOTALL)
regtidyandparsingpdf = compile(r'[\w\d\s]*(.*?)[\w\d\s]*(.*?)[\w\d\s]*',DOTALL)

# (An example comment that stood here has lost its markup; only "1" survived.)
somedict = {}

# Predicates that RDFreader3.addSimpleCacheUsingMemcache stores in memcache
# (keyed by an md5 of subject+predicate) instead of the in-process dict, and
# that RDFreader3.readRDFUsingMemcache reads back from memcache.
commonpredicates = ('http://www.w3.org/WAI/ER/EARL/nmg-strawman#subject',
                    'http://www.eiao.net/rdf/2.0#value',
                    'http://www.w3.org/WAI/ER/EARL/nmg-strawman#validity',
                    'http://www.w3.org/WAI/ER/EARL/nmg-strawman#requirement',
                    'http://www.w3.org/WAI/ER/EARL/nmg-strawman#result',
                    'http://www.eiao.net/rdf/2.0#barrierIndicator',
                    'http://www.eiao.net/rdf/2.0#singleLocation',
                    'http://www.eiao.net/rdf/2.0#column',
                    'http://www.eiao.net/rdf/2.0#line',
                    'http://www.eiao.net/rdf/2.0#type'
                    )

# Predicates the addSimpleCache* methods silently drop. Currently empty.
predicatestoignore = ()#('http://www.eiao.net/rdf/2.0#type',)


def sortbysize(x,y):
    """cmp-style comparator that orders two sequences by their element at index 1.

    Returns -1, 0 or 1. Written without the ``cmp`` builtin (which no longer
    exists in Python 3) so it works on both Python 2 and Python 3; on
    Python 3 use it through ``functools.cmp_to_key``.
    """
    left, right = x[1], y[1]
    return (left > right) - (left < right)


class RDFreader3:
    """Reader that caches RDF triples from the local triple store."""

    def __init__(self,model,dbdatabase,allasredland=False,host='localhost'):
        """Store the connection settings and create a freshly flushed memcache client.

        Keyword arguments:
        model -- Model to operate on. Only stored; kept for backwards
                 compatibility.
        dbdatabase -- Name of the database to connect to.
        allasredland -- Retrieve all information as Redland. False by default.
        host -- Host to connect to. 'localhost' by default.
        """
        self.alldata = []
        self.onlysubject = {}
        # Maps (subject, predicate) -> cached object (or set of objects).
        self.onlysubjectpredicate = {}
        self.model = model
        self.dbdatabase = dbdatabase
        self.host = host
        self.allasredland = allasredland
        self.numrecords = 0
        # Local import, as in the original code; `sc` is a project module.
        from sc import SystemConfiguration
        sc = SystemConfiguration()
        self.mc = memcache.Client([sc.etlmemcache], debug=0)
        # Constructing a reader wipes everything on the configured memcache.
        self.mc.flush_all()
        if self.allasredland:
            # Single-argument call form: prints identically on Python 2 and 3.
            print('Note parsing the RDF using redland is very slow. This should only be enabled when full flexability is required, not e.g. reading from the ETL.')
# --- Methods of class RDFreader3 (continuation of the class body) ---
    def addSimpleCacheNotUsingMemcache(self,subject,predicate,object):
        """Cache one triple in the in-process dict self.onlysubjectpredicate.

        Keyword arguments:
        subject -- Triple subject, first half of the dict key
        predicate -- Triple predicate, second half of the dict key
        object -- Value stored under (subject, predicate)

        For the pageSurvey and scenario predicates, values are accumulated:
        the first is stored as-is, the second turns the entry into a set, and
        later ones are added to that set. Any other predicate simply
        overwrites the previous value.
        """
        if predicate in predicatestoignore:
            return
        if predicate in ('http://www.eiao.net/rdf/2.0#pageSurvey','http://www.eiao.net/rdf/2.0#scenario'):
            temp = self.onlysubjectpredicate.get((subject,predicate),None)
            # A falsy stored value (e.g. '') is treated like "nothing stored".
            if not temp:
                self.onlysubjectpredicate[(subject,predicate)] = object
            # Exactly one plain str stored so far: promote to a set.
            elif type(temp)==type(''):
                self.onlysubjectpredicate[(subject,predicate)] = set([temp,object])
            else:
                temp.add(object)
                self.onlysubjectpredicate[(subject,predicate)] = temp
            return
        self.onlysubjectpredicate[(subject,predicate)] = object

    def addSimpleCacheUsingMemcache(self,subject,predicate,object):
        """Cache one triple, sending predicates in commonpredicates to memcache.

        Keyword arguments:
        subject -- Triple subject
        predicate -- Triple predicate
        object -- Value to cache

        Predicates in commonpredicates are stored via self.mc under the md5
        hex digest of subject+predicate. Everything else is handled exactly
        like addSimpleCacheNotUsingMemcache.
        """
        if predicate in predicatestoignore:
            return
        if predicate in commonpredicates:
            hash = md5.new(subject + predicate).hexdigest()
            # NOTE(review): memcache `add` presumably keeps an existing value
            # instead of overwriting it - confirm against the client docs.
            self.mc.add(hash,object)
            return
        if predicate in ('http://www.eiao.net/rdf/2.0#pageSurvey','http://www.eiao.net/rdf/2.0#scenario'):
            temp = self.onlysubjectpredicate.get((subject,predicate),None)
            if not temp:
                self.onlysubjectpredicate[(subject,predicate)] = object
            elif type(temp)==type(''):
                self.onlysubjectpredicate[(subject,predicate)] = set([temp,object])
            else:
                temp.add(object)
                self.onlysubjectpredicate[(subject,predicate)] = temp
            return
        self.onlysubjectpredicate[(subject,predicate)] = object

    def addCacheData(self,pagesurvey=None,pdfparse=False):
        """Internal function that fills the cache from the RAWRDF table.

        Keyword arguments:
        pagesurvey -- Used in the visible code as the subject of the
                      'asserts' and 'metadata' cache entries
        pdfparse -- Not used in the visible code

        Opens a MySQL connection with credentials from SystemConfiguration,
        extracts values from the RDF text with the module-level regexes,
        stores them through self.addSimpleCache, sets self.alldata from
        getInitialCache and closes the connection.

        NOTE(review): part of this function's source text is missing (see the
        marker below). `data`, `testsubject`, `calculatedsize`, `allmodels`
        and the attributes self.addSimpleCache / self.readRDF are never
        assigned in the visible code; presumably the lost section did that.
        """
        print 'Caching, please wait . . .',
        # Easter egg: roughly once in 1001 calls, prints an obfuscated string
        # that decodes to 'Hyperdrive active'.
        if not random.randint(0,1000):
            print ''.join([chr(int(str(ord('0OFAH@HELA\x14=?JELA'[d])),16)) for d in range(17)]),'...'
        from sc import SystemConfiguration
        sc = SystemConfiguration()
        connection = MySQLdb.connect(host=self.host, user=sc.dbusername, passwd=sc.dbpassword, db=self.dbdatabase)
        cur = connection.cursor()
        if not self.alldata:
            # Total size of all stored RDF blobs.
            cur.execute('select sum(length(rdf)) from RAWRDF;')
            rdfsize = cur.fetchall()[0][0]
            # NOTE(review): SOURCE TEXT LOST on the next line. Everything
            # between `if rdfsize` and the tail of the calculatedsize list
            # comprehension is missing; the line is kept verbatim and is not
            # valid Python. The nesting of the statements that follow is a
            # best-effort reconstruction (they presumably ran once per RDF
            # row) - restore this function from version control.
            if rdfsize')+1:s.rfind('<')] for s in regcalculatedsize.findall(data)][0]
            self.addSimpleCache(testsubject,eiao+'calculatedSize',calculatedsize)
            #metadatas = [s[s.find('"')+1:s.rfind('"')] for s in regmetadata.findall(data)]
            #assertions = [s[s.find('"')+1:s.rfind('"')] for s in regassertdata.findall(data)]
            #assertion
            metadatablocks = [s for s in regmetadatablocks.findall(data)]
            assertionblocks = [s for s in regassertblocks.findall(data)]
            #singlelocationblock = [s for s in regsinglelocationblocks.findall(data)]
            # Validity and barrier indicator per subject.
            for subject,valobject,barrierobject in regvalidity.findall(data):
                self.addSimpleCache(subject,earl+'validity',valobject)
                self.addSimpleCache(subject,eiao+'barrierIndicator',barrierobject)
            # Line and column per single-location subject.
            for subject,line,column in regsinglelocationblock.findall(data):
                self.addSimpleCache(subject,eiao+'line',line)
                self.addSimpleCache(subject,eiao+'column',column)
            # (An example comment that stood here has lost its markup; only "1" survived.)
            # First pass: collect all assertion IDs and cache them as one set.
            tempAssertions = set()
            #tempRequirements = set()
            for assertionblock in assertionblocks:
                assertion = regassertdata.search(assertionblock).groups()[0]
                tempAssertions.add(assertion)
                #requirement = regrequirement.search(assertionblock).groups()[0]
                #tempRequirements.add(requirement)
                #self.addSimpleCache(requirement,'http://purl.org/dc/elements/1.1/title',requirement)
            self.addSimpleCache(pagesurvey,earl+'asserts',tempAssertions)
            del tempAssertions
            #self.addSimpleCache(assertion,earl+'requirement',tempRequirements)
            #print assertion,earl+'requirement',tempRequirements
            #del tempRequirements
            # Second pass over the de-duplicated blocks: per-assertion details.
            for assertionblock in set(assertionblocks):
                #requirement = regrequirement.search(assertionblock).groups()[0]
                #self.addSimpleCache(assertion,earl+'requirement',requirement)
                #TODO: Investigate if the next line is actually possible. In other words, make sure it is true that the title of the requirement is always equal to UWEMID
                #self.addSimpleCache(requirement,'http://purl.org/dc/elements/1.1/title',requirement)
                #singlelocation = [s[s.find('"')+1:s.rfind('"')] for s in regsinglelocation.findall(assertionblock)][0]
                assertion = regassertdata.search(assertionblock).groups()[0]
                requirement = regrequirement.search(assertionblock).groups()[0]
                #print requirement
                self.addSimpleCache(assertion,earl+'requirement',requirement)
                singlelocation = regsinglelocation.search(assertionblock).groups()[0]
                self.addSimpleCache(assertion,eiao+'singleLocation',singlelocation)
                #TODO: Find a much better way of finding singlelocations
                #singlelocationblock = [s for s in singlelocationblocks if s.find(singlelocation[singlelocation.find('#')+1:])>0][0]
                #line = [s[s.find('>')+1:s.rfind('<')] for s in regline.findall(singlelocationblock)][0]
                #line = regline.search(singlelocationblock).groups()[0]
                #self.addSimpleCache(singlelocation,eiao+'line',line)
                #column = [s[s.find('>')+1:s.rfind('<')] for s in regcolumn.findall(singlelocationblock)][0]
                #column = regcolumn.search(singlelocationblock).groups()[0]
                #self.addSimpleCache(singlelocation,eiao+'column',column)
                result = regresult.search(assertionblock).groups()[0]
                self.addSimpleCache(assertion,earl+'result',result)
            # Same two-pass scheme for the metadata blocks.
            tempMetaData = set()
            for metadatablock in metadatablocks:
                metadata = regmetadata.search(metadatablock).groups()[0]
                tempMetaData.add(metadata)
            self.addSimpleCache(pagesurvey,eiao+'metadata',tempMetaData)
            del tempMetaData
            for metadatablock in metadatablocks:
                metadata = regmetadata.search(metadatablock).groups()[0]
                #self.addSimpleCache(pagesurvey,eiao+'metadata',metadata)
                #datatype = [s[s.find('"')+1:s.rfind('"')] for s in regtype.findall(metadatablock)][0]
                datatype = regtype.search(metadatablock).groups()[0]
                self.addSimpleCache(metadata,eiao+'type',datatype)
                #value = [s[s.find('>')+1:s.rfind('<')] for s in regvalue.findall(metadatablock)][0]
                value = regvalue.search(metadatablock).groups()[0]
                self.addSimpleCache(metadata,eiao+'value',value)
                #singlelocation = [s[s.find('"')+1:s.rfind('"')] for s in regsinglelocation.findall(metadatablock)][0]
                singlelocation = regsinglelocation.search(metadatablock).groups()[0]
                self.addSimpleCache(metadata,eiao+'singleLocation',singlelocation)
                metadatasubject = regsubject.search(metadatablock).groups()[0]
                self.addSimpleCache(metadata,earl+'subject',metadatasubject)
                #singlelocationblock = [s for s in singlelocationblocks if s.find(singlelocation[singlelocation.find('#')+1:])>0][0]
                #line = [s[s.find('>')+1:s.rfind('<')] for s in regline.findall(singlelocationblock)][0]
                #line = regline.search(singlelocationblock).groups()[0]
                #self.addSimpleCache(singlelocation,eiao+'line',line)
                #column = regcolumn.search(singlelocationblock).groups()[0]
                #column = [s[s.find('>')+1:s.rfind('<')] for s in regcolumn.findall(singlelocationblock)][0]
                #self.addSimpleCache(singlelocation,eiao+'column',column)
            del(data)
            #gc.collect()
            # getInitialCache returns a non-empty list, so afterwards the
            # `if not self.alldata` guard above is false.
            self.alldata = self.getInitialCache(allmodels)
        connection.close()
        # Ends the progress line started by the first print.
        print ''

    def stop(self):
        """Stop function. Does nothing; only present for backwards compatibility."""
        pass

    def getInitialCache(self,allmodels):
        """Feed every statement of the given Redland models into the cache.

        Keyword arguments:
        allmodels -- All models as a list of RDF Redland Models

        Each statement is converted to (subject, predicate, object) strings
        and passed to self.addSimpleCache. Returns the fixed placeholder list
        ['a value']; the code that appended statements to it is commented out.
        """
        # A statement pattern with all fields None; presumably matches every
        # statement in the model - confirm against the Redland bindings.
        stat = RDF.Statement(subject = None, predicate = None, object = None)
        allstatements = ['a value']
        for m in allmodels:
            for s in m.find_statements(stat):
                if s:
                    if s.object.is_literal():
                        # Literal objects are stored as iso-8859-1 byte strings.
                        object = str(s.object.literal_value['string'].encode('iso-8859-1'))
                        #d = {'subject':str(s.subject.uri),'predicate':str(s.predicate.uri),'object':object,'literal':1}
                    else:
                        object = str(s.object.uri)
                        #d = {'subject':str(s.subject.uri),'predicate':str(s.predicate.uri),'object':str(s.object.uri),'literal':0}
                    subject = str(s.subject.uri)
                    predicate = str(s.predicate.uri)
                    #allstatements.append(d)
                    #self.onlysubject[subject] = self.onlysubject.get(subject,[]) + [d]
                    self.addSimpleCache(subject,predicate,object)
                    #self.onlysubjectpredicate[(subject,predicate)] = self.onlysubjectpredicate.get((subject,predicate),[]) + [d]
        return allstatements

    def readRDFNotUsingMemcache(self,subject=None,predicate=None,object=None,supercached=None,readonlyonce=True):
        """Look up the cached value for (subject, predicate) in the in-process dict.

        Keyword arguments:
        subject -- Triple subject, None if any
        predicate -- Triple predicate, None if any
        object -- Triple object, None if any. Not used by the lookup.
        supercached -- [Optional] Does nothing. Present for backwards
                       compatibility only.
        readonlyonce -- Not used by the live code (only by the commented-out
                        variant below).

        Returns the cached value, or [] when nothing is cached. On a
        TypeError the key is printed and None is returned.
        """
        #if not self.alldata:
        #    self.addCacheData()
        try:
            return self.onlysubjectpredicate.get((subject,predicate),[])
        # NOTE(review): dict.get() with a default does not raise KeyError, so
        # this handler looks unreachable.
        except KeyError:
            return []
        except TypeError:
            print subject,predicate
            return
        #if not readonlyonce:
        #    try:
        #        return self.onlysubjectpredicate.get((subject,predicate),[])
        #    except Exception,e:
        #        print subject,predicate
        #        raise(e)
        #else:
        #    try:
        #        return self.onlysubjectpredicate.pop((subject,predicate))
        #    except KeyError:
        #        return []
        #    except TypeError:
        #        print subject,predicate
        #        return

    def readRDFUsingMemcache(self,subject=None,predicate=None,object=None,supercached=None,readonlyonce=True):
        """Look up the cached value for (subject, predicate), using memcache for common predicates.

        Keyword arguments:
        subject -- Triple subject, None if any
        predicate -- Triple predicate, None if any
        object -- Triple object, None if any. Not used by the lookup.
        supercached -- [Optional] Does nothing. Present for backwards
                       compatibility only.
        readonlyonce -- Not used by the live code (only by the commented-out
                        variant below).

        Predicates in commonpredicates are fetched from self.mc under the md5
        hex digest of subject+predicate. All others come from
        self.onlysubjectpredicate, with [] as the default. Any exception from
        the dict lookup is printed with its key and re-raised.
        """
        #if not self.alldata:
        #    self.addCacheData()
        if predicate in commonpredicates:
            # Must build the key exactly as addSimpleCacheUsingMemcache does.
            hash = md5.new(subject+predicate).hexdigest()
            return self.mc.get(hash)
        try:
            return self.onlysubjectpredicate.get((subject,predicate),[])
        except Exception,e:
            print subject,predicate
            raise(e)
        #if not readonlyonce:
        #    try:
        #        return self.onlysubjectpredicate.get((subject,predicate),[])
        #    except Exception,e:
        #        print subject,predicate
        #        raise(e)
        #else:
        #    try:
        #        return self.onlysubjectpredicate.pop((subject,predicate))
        #    except KeyError:
        #        return []
        #    except TypeError:
        #        print subject,predicate
        #        return


# Manual smoke test: builds a reader for ten databases and queries one triple.
# NOTE(review): `RDFreader2` is not defined in the visible source (the class
# here is RDFreader3), and `readRDF` is not defined as a method in the visible
# code - confirm both before running this script.
if __name__ == '__main__':
    #Language feb3_5_1
    for i in range(10):
        print i
        rdfreader = RDFreader2('EIAO_test_spiller_ingen_trille','feb5_1_'+str(i+1))
        earl = "http://www.w3.org/WAI/ER/EARL/nmg-strawman#"
        eiao = "http://www.eiao.net/rdf/2.0#"
        rdfreader.readRDF(subject='http://www.eiao.net/rdf/2.0/SiteSurvey_0',predicate=eiao+'barrierIndicator',object=None)
        #pre = time.time()
        #print len(rdfreader.readRDF(subject=None,predicate=None,object=earl + "Assertion"))
        #print time.time()-pre
        #pre = time.time()
        #print len(rdfreader.readRDF(predicate='http://www.eiao.net/rdf/2.0#singleLocation',subject=None,object=None))
        #print time.time()-pre
        #pre = time.time()
        #print len(rdfreader.readRDF(subject=None,predicate=None,object=earl + "Assertion"))
        #print time.time()-pre
        #import pdb
        #pdb.set_trace()
        #print rdfreader(subject = metadata, predicate = eiao+'type', object=None)
        # NOTE(review): whether this second call is live code, and whether it
        # sits inside the loop, cannot be told from the collapsed source.
        rdfreader.readRDF(subject='http://www.eiao.net/rdf/2.0/SiteSurvey_0',predicate=eiao+'barrierIndicator',object=None)