#!/usr/bin/python

import argparse as ap
import os
import re
import sys

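'''
Check a TREC CAR run file for formatting errors.

Usage:
    check_car.py <task> <run_file>

Example (a task of "passages" selects the paragraph-run checks; any other
task value selects the entity-run checks):
    check_car.py passages run1

The list of valid query ids is read from the file "car.topics" in the
current working directory.
'''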


# Pattern applied by CheckRun to each run tag.  Despite the name, it describes
# what is NOT allowed: a run of 1-15 non-word characters (regex ``\W``).
# A raw string is used so the backslash reaches the regex engine as written
# instead of depending on Python passing an unknown string escape through.
VALID_TAG = re.compile(r'\W{1,15}')

def error(msg):
    """Print *msg* followed by a blank line on stdout and exit with status 255.

    This function never returns: ``sys.exit`` raises ``SystemExit``.

    Args:
        msg: the text to report (a string).
    """
    # Call form with a single argument prints the same bytes under both
    # Python 2 and Python 3; the old ``print msg`` statement is a
    # SyntaxError on Python 3.
    print(msg + "\n")
    sys.exit(255)

def CheckRun(run_file, query_list, is_paragraph_run):
    """Validate a run file line by line, stopping at the first problem.

    Each line must consist of exactly six columns separated by single
    spaces: ``qid Q0 doc_id rank score run_tag``.  For every line the
    following is checked, in this order:

    1. the line has six columns;
    2. ``qid`` is one of ``query_list``;
    3. ``run_tag`` does not match ``VALID_TAG`` and equals the tag seen on
       the first line;
    4. column 2 is the literal ``Q0``;
    5. ``rank`` parses as an integer;
    6. ``doc_id`` is well formed (see ``is_paragraph_run``);
    7. ``doc_id`` has not already appeared for the same query.

    After the whole file has been read, every query in ``query_list`` must
    have at least one result line.  The score column is not validated.

    Args:
        run_file: path of the run file to check.
        query_list: collection of valid query ids (strings).
        is_paragraph_run: if true, the part of ``doc_id`` before the first
            ``/`` (or the whole id when there is no ``/``) must be exactly
            40 lowercase hexadecimal characters.  Otherwise ``doc_id`` is
            parsed with ``split_doc_entity`` and must be either
            ``documentId/entityId`` or a bare entity id.

    Returns:
        None.  Every failure is reported through ``error()``, which prints
        the message and terminates the process with exit status 255.
    """
    # Set for O(1) membership tests; query_list itself is kept so the final
    # "missing query" report follows the caller's order.
    known_queries = set(query_list)
    first_run_tag = None
    # query id -> {doc_id: line number on which it was first seen}
    results = {}

    # The context manager closes the file even when error() exits by
    # raising SystemExit.
    with open(run_file, 'r') as run:
        for line_count, line in enumerate(run, 1):
            where = 'Line (' + str(line_count) + ') ' + run_file

            split = line.strip().split(' ')
            if len(split) != 6:
                error(where +
                      ': format incorrect. Expected format: qid Q0 doc_id rank score run_tag. Found ' + line)
            query = split[0].strip()
            dummy = split[1]
            doc_id = split[2].strip()
            doc_rank = split[3]
            run_tag = split[5].strip()

            # Topic name.  Sections begin after '/': Query_topic/Section_name
            if query not in known_queries:
                error(where +
                      ': incorrect query id (either section or article not correct. Query topic ' + query + ' does not match test set.')

            seen_docs = results.setdefault(query, {})

            # Run name.
            # NOTE(review): match() is anchored at the start of the tag, so
            # only tags that BEGIN with a non-word character are rejected.
            # Confirm whether search() was intended before tightening this.
            if VALID_TAG.match(run_tag):
                error(where +
                      ': malformed run tag(' + run_tag + '). Special characters as run-tag not allowed.')

            if first_run_tag is None:
                first_run_tag = run_tag

            if run_tag != first_run_tag:
                error(where +
                      ': malformed run tag(' + run_tag + '). Only one run_tag allowed per run file.')

            # Dummy column.
            if dummy != 'Q0':
                error(where +
                      ':  malformed second column. Column 2 is ' + dummy + ' not Q0.')

            # Rank position.
            try:
                int(doc_rank)
            except ValueError as ex:
                print(ex)
                error(where +
                      ': malformed document rank. Column 4 is ' + doc_rank + ' not a valid integer.')

            if is_paragraph_run:
                # The paragraph hash is everything before the first '/'.
                doc_name = doc_id.split('/', 1)[0]
                # re.match returns None when the id is not a 40-character
                # hash; test the match object itself rather than calling
                # .group() on it, which would raise AttributeError.
                if re.match("^([0-9a-f]{40})$", doc_name) is None:
                    error(where +
                          ': malformed document id. Column 3 is ' + doc_id + ' not a valid document hash.')
            else:
                (docname, entityid) = split_doc_entity(doc_id.strip())
                if docname is None and entityid is None:
                    error(where +
                          ': malformed document id/entity id. Column 3 is ' + doc_id + ' but needs to be either documentId/entityId or just entity id. Remember to preserve the namespace for entity ids.')
                elif docname is not None and len(docname) > 40:
                    error(where +
                          ': malformed document id. Column 3 is ' + doc_id + ', document id is not a valid document hash.')
                elif entityid is not None and re.search("([0-9a-f]{40})", entityid) is not None:
                    error(where +
                          ': malformed entity id ' + entityid + ' contains a document id. Column 3 is ' + doc_id + '.')

            # A document id may appear only once per query.
            if doc_id in seen_docs:
                error(where +
                      ': duplicate document id. Document id ' + doc_id + ' repeats for ' +
                      query + ', previously appearing on line no ' +
                      str(seen_docs[doc_id]) + '.')

            seen_docs[doc_id] = line_count

    # Every query must have at least one result.
    for query in query_list:
        if query not in results:
            error('Results for query (' + query + ') absent from ' + run_file +
                  '. Each query must have some results.')

def split_doc_entity(text):
    """Split a column-3 value into its document id and entity id parts.

    Two shapes are recognised:

    * ``<40 lowercase hex chars>/<letters>:<anything>`` gives
      ``(document_id, entity_id)``;
    * ``<letters>:<anything>`` gives ``(None, entity_id)``.

    Anything else gives ``(None, None)``.
    """
    # Try the combined documentId/entityId form first.
    paired = re.match("^([0-9a-f]{40})/([a-zA-Z]+:.+)$", text)
    if paired is not None:
        return (paired.group(1), paired.group(2))

    # Fall back to a bare, namespaced entity id.
    bare = re.match("^([a-zA-Z]+:.+)$", text)
    if bare is None:
        return (None, None)
    return (None, bare.group(1))

def main():
    """Parse the command line, load the query ids and check the run file.

    Command-line arguments:
        task: the run task; the value ``passages`` selects the paragraph-run
            checks in ``CheckRun``, any other value the entity-run checks.
        runFile: path of the run file to validate.

    The valid query ids are read, one per line, from the file ``car.topics``
    in the current working directory.  Blank lines in that file are skipped,
    because an empty query id could never be matched by any run line.

    Exits with status 0 when the run passes every check; ``CheckRun`` exits
    through ``error()`` otherwise.

    Raises:
        ValueError: if the run file path is empty or does not exist, or if
            the query file does not exist.
    """
    parser = ap.ArgumentParser(description='Evaluate TREC CAR submissions for errors.')
    parser.add_argument('task', help='Run task.')
    parser.add_argument('runFile', help='File containing the run. Expected format: qid Q0 doc_id rank score run_tag')

    args = parser.parse_args()

    if not args.runFile:
        raise ValueError('File for run must be provided. Please provide correct path for run.')

    if not os.path.exists(args.runFile):
        raise ValueError('File does not exist. Please provide correct path for runs.')

    # Fixed file name, resolved relative to the current working directory.
    queryFile = "car.topics"
    if not os.path.exists(queryFile):
        raise ValueError('File with queries does not exist. Please provide correct path for queries.')

    isPassage = (args.task == 'passages')

    with open(queryFile, 'r') as topics:
        stripped = (line.strip() for line in topics)
        query_list = [query_name for query_name in stripped if query_name]

    CheckRun(args.runFile, query_list, isPassage)
    sys.exit(0)


# Run the checker only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()