#!/usr/bin/python
"""Check a TREC CAR run file for formatting errors.

Usage:
    check_car.py <task> <runFile> [-q <queryFile>]

Example:
    check_car.py passages run1 -q test.benchmarkY1test.titles

<task> selects the document-id check: 'passages' validates column 3 as a
paragraph id; any other value validates it as an entity id (optionally
prefixed by a paragraph id).  On the first problem found, a message is
printed and the process exits with status 255; a clean run exits with 0.
"""

import argparse as ap
import os
import re
import sys

# NOTE(review): this pattern is applied with .match(), so a run tag is only
# rejected when it *starts* with a non-word character.  The error message
# suggests special characters should be rejected anywhere in the tag; the
# allowed alphabet is not visible from this file, so the behaviour is kept.
VALID_TAG = re.compile(r'\W{1,15}')

# A paragraph id is exactly 40 lower-case hexadecimal characters.
_PARAGRAPH_ID = re.compile(r"^([0-9a-f]{40})$")
# Used to detect a paragraph id hidden inside an entity id.
_EMBEDDED_PARAGRAPH_ID = re.compile(r"([0-9a-f]{40})")
# "paragraphId/namespace:entity"
_DOC_AND_ENTITY = re.compile(r"^([0-9a-f]{40})/([a-zA-Z]+:.+)$")
# "namespace:entity"
_ENTITY_ONLY = re.compile(r"^([a-zA-Z]+:.+)$")


def error(msg):
    """Print msg followed by a blank line and exit with status 255.

    This function never returns: sys.exit raises SystemExit.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(msg + "\n")
    sys.exit(255)


def _doc_id_problem(doc_id, is_paragraph_run):
    """Return the error-message tail for an invalid doc_id, or None if valid."""
    if is_paragraph_run:
        # Only the part before the first '/' has to be a paragraph id.
        doc_name = doc_id.split('/', 1)[0]
        # re.match returns None when nothing matches, so test the match
        # object itself instead of calling .group() on it.
        if _PARAGRAPH_ID.match(doc_name) is None:
            return (': malformed document id. Column 3 is ' + doc_id +
                    ' not a valid document hash.')
        return None

    docname, entityid = split_doc_entity(doc_id.strip())
    if docname is None and entityid is None:
        return (': malformed document id/entity id. Column 3 is ' + doc_id +
                ' but needs to be either documentId/entityId or just entity id.'
                ' Remember to preserve the namespace for entity ids.')
    if docname is not None and len(docname) > 40:
        return (': malformed document id. Column 3 is ' + doc_id +
                ', document id is not a valid document hash.')
    if entityid is not None and _EMBEDDED_PARAGRAPH_ID.search(entityid) is not None:
        return (': malformed entity id ' + entityid +
                ' contains a document id. Column 3 is ' + doc_id + '.')
    return None


def CheckRun(run_file, query_list, is_paragraph_run):
    """Validate every line of run_file; exit via error() on the first problem.

    Each line must consist of six single-space-separated fields:
    qid Q0 doc_id rank score run_tag.

    Args:
        run_file: path of the run file to check.
        query_list: query ids of the test set; every one of them must have
            at least one result in the run.
        is_paragraph_run: True to validate column 3 as a paragraph id,
            False to validate it as an entity id.

    Returns:
        None when the run passes all checks.

    Raises:
        SystemExit: with code 255 (through error()) on the first violation.
    """
    known_queries = set(query_list)  # O(1) membership test per run line
    first_run_tag = None
    results = {}  # query id -> {doc_id: line number of first occurrence}

    with open(run_file, 'r') as run:
        for line_count, line in enumerate(run, 1):
            where = 'Line (' + str(line_count) + ') ' + run_file

            split = line.strip().split(' ')
            if len(split) != 6:
                error(where + ': format incorrect. Expected format: '
                      'qid Q0 doc_id rank score run_tag. Found ' + line)
            query = split[0].strip()
            dummy = split[1]
            doc_id = split[2].strip()
            doc_rank = split[3]
            run_tag = split[5].strip()

            # Topic name.  Sections begin after '/'.
            # Format: Query_topic/Section_name
            if query not in known_queries:
                error(where + ': incorrect query id (either section or article'
                      ' not correct. Query topic ' + query +
                      ' does not match test set.')
            if query not in results:
                results[query] = {}

            # Run name.
            if VALID_TAG.match(run_tag):
                error(where + ': malformed run tag(' + run_tag +
                      '). Special characters as run-tag not allowed.')
            if not first_run_tag:
                first_run_tag = run_tag
            if run_tag != first_run_tag:
                error(where + ': malformed run tag(' + run_tag +
                      '). Only one run_tag allowed per run file.')

            # Dummy column.
            if dummy != 'Q0':
                error(where + ': malformed second column. Column 2 is ' +
                      dummy + ' not Q0.')

            # Rank position.
            try:
                int(doc_rank)
            except ValueError as ex:
                print(ex)
                error(where + ': malformed document rank. Column 4 is ' +
                      doc_rank + ' not a valid integer.')

            # Document / entity id.
            problem = _doc_id_problem(doc_id, is_paragraph_run)
            if problem is not None:
                error(where + problem)

            # A document id must not repeat within a query.
            seen = results[query]
            if doc_id in seen:
                error(where + ': duplicate document id. Document id ' +
                      doc_id + ' repeats for ' + query +
                      ', previously appearing on line no ' +
                      str(seen[doc_id]) + '.')
            seen[doc_id] = line_count

    # Every query of the test set needs at least one result.
    for query in query_list:
        if query not in results:
            error('Results for query (' + query + ') absent from ' + run_file +
                  '. Each query must have some results.')


def split_doc_entity(text):
    """Split column 3 of an entity run into (paragraph id, entity id).

    Returns:
        (docid, entityid) for "<40 hex chars>/<namespace>:<name>",
        (None, entityid) for "<namespace>:<name>",
        (None, None) when text has neither form.
    """
    matchobj = _DOC_AND_ENTITY.match(text)
    if matchobj is not None:
        return (matchobj.group(1), matchobj.group(2))

    matchobj = _ENTITY_ONLY.match(text)
    if matchobj is not None:
        return (None, matchobj.group(1))

    return (None, None)


def main():
    """Parse the command line, load the query list and check the run file.

    Raises:
        ValueError: if the run file or the query file does not exist.
    """
    parser = ap.ArgumentParser(description='Evaluate TREC CAR submissions for errors.')
    parser.add_argument('task', help='Run task.')
    parser.add_argument('runFile', help='File containing the run. Expected format: qid Q0 doc_id rank score run_tag')
    # Optional, so existing "check_car.py <task> <runFile>" invocations keep
    # reading car.topics from the current directory.
    parser.add_argument('-q', '--queries', dest='queryFile', default='car.topics',
                        help='File listing one query id per line (default: car.topics).')
    args = parser.parse_args()

    if not args.runFile:
        raise ValueError('File for run must be provided. Please provide correct path for run.')
    if not os.path.exists(args.runFile):
        raise ValueError('File does not exist. Please provide correct path for runs.')

    queryFile = args.queryFile
    if not os.path.exists(queryFile):
        raise ValueError('File with queries does not exist. Please provide correct path for queries.')

    isPassage = (args.task == 'passages')

    query_list = []
    with open(queryFile, 'r') as topics:
        for line in topics:
            query_name = line.strip()
            # A blank line would become an empty query id that no run line
            # can ever match, failing every submission.
            if query_name:
                query_list.append(query_name)

    CheckRun(args.runFile, query_list, isPassage)
    sys.exit(0)


if __name__ == '__main__':
    main()