import argparse import xml.etree.cElementTree as etree import codecs from six.moves.html_parser import HTMLParser import re import tempfile import bioc import pymarc import shutil import six import unicodedata import calendar import json # Remove empty brackets (that could happen if the contents have been removed already # e.g. for citation ( [3] [4] ) -> ( ) -> nothing def removeBracketsWithoutWords(text): fixed = re.sub(r'\([\W\s]*\)', ' ', text) fixed = re.sub(r'\[[\W\s]*\]', ' ', fixed) fixed = re.sub(r'\{[\W\s]*\}', ' ', fixed) return fixed # Some older articles have titles like "[A study of ...]." # This removes the brackets while retaining the full stop def removeWeirdBracketsFromOldTitles(titleText): titleText = titleText.strip() if titleText[0] == '[' and titleText[-2:] == '].': titleText = titleText[1:-2] + '.' return titleText def cleanupText(text): # Remove some "control-like" characters (left/right separator) text = text.replace(u'\u2028',' ').replace(u'\u2029',' ') text = "".join(ch for ch in text if unicodedata.category(ch)[0]!="C") text = "".join(ch if unicodedata.category(ch)[0]!="Z" else " " for ch in text) # Remove repeated commands and commas next to periods text = re.sub(',(\s*,)*',',',text) text = re.sub('(,\s*)*\.','.',text) return text.strip() # Unescape HTML special characters e.g. > is changed to > htmlParser = HTMLParser() def htmlUnescape(text): return htmlParser.unescape(text) # XML elements to ignore the contents of ignoreList = ['table', 'table-wrap', 'xref', 'disp-formula', 'inline-formula', 'ref-list', 'bio', 'ack', 'graphic', 'media', 'tex-math', 'mml:math', 'object-id', 'ext-link'] # XML elements to separate text between separationList = ['title', 'p', 'sec', 'break', 'def-item', 'list-item', 'caption'] def extractTextFromElem(elem): # Extract any raw text directly in XML element or just after head = "" if elem.text: head = elem.text tail = "" if elem.tail: tail = elem.tail # Then get the text from all child XML nodes recursively childText = [] for child in elem: childText = childText + extractTextFromElem(child) # Check if the tag should be ignore (so don't use main contents) if elem.tag in ignoreList: return [tail.strip()] # Add a zero delimiter if it should be separated elif elem.tag in separationList: return [0] + [head] + childText + [tail] # Or just use the whole text else: return [head] + childText + [tail] # Merge a list of extracted text blocks and deal with the zero delimiter def extractTextFromElemList_merge(list): textList = [] current = "" # Basically merge a list of text, except separate into a new list # whenever a zero appears for t in list: if t == 0: # Zero delimiter so split if len(current) > 0: textList.append(current) current = "" else: # Just keep adding current = current + " " + t current = current.strip() if len(current) > 0: textList.append(current) return textList # Main function that extracts text from XML element or list of XML elements def extractTextFromElemList(elemList): textList = [] # Extracts text and adds delimiters (so text is accidentally merged later) if isinstance(elemList, list): for e in elemList: textList = textList + extractTextFromElem(e) + [0] else: textList = extractTextFromElem(elemList) + [0] # Merge text blocks with awareness of zero delimiters mergedList = extractTextFromElemList_merge(textList) # Remove any newlines (as they can be trusted to be syntactically important) mergedList = [ text.replace('\n', ' ') for text in mergedList ] # Remove no-break spaces mergedList = [ cleanupText(text) for text in mergedList ] return mergedList def getMetaInfoForPMCArticle(articleElem): monthMapping = {} for i,m in enumerate(calendar.month_name): monthMapping[m] = i for i,m in enumerate(calendar.month_abbr): monthMapping[m] = i # Attempt to extract the PubMed ID, PubMed Central IDs and DOIs pmidText = '' pmcidText = '' doiText = '' article_id = articleElem.findall('./front/article-meta/article-id') + articleElem.findall('./front-stub/article-id') for a in article_id: if a.text and 'pub-id-type' in a.attrib and a.attrib['pub-id-type'] == 'pmid': pmidText = a.text.strip().replace('\n',' ') if a.text and 'pub-id-type' in a.attrib and a.attrib['pub-id-type'] == 'pmc': pmcidText = a.text.strip().replace('\n',' ') if a.text and 'pub-id-type' in a.attrib and a.attrib['pub-id-type'] == 'doi': doiText = a.text.strip().replace('\n',' ') # Attempt to get the publication date pubdates = articleElem.findall('./front/article-meta/pub-date') + articleElem.findall('./front-stub/pub-date') pubYear,pubMonth,pubDay = None,None,None if len(pubdates) >= 1: mostComplete,completeness = None,0 for pubdate in pubdates: pubYear_Field = pubdate.find("./year") if not pubYear_Field is None: pubYear = pubYear_Field.text.strip().replace('\n',' ') pubSeason_Field = pubdate.find("./season") if not pubSeason_Field is None: pubSeason = pubSeason_Field.text.strip().replace('\n',' ') monthSearch = [ c for c in (list(calendar.month_name) + list(calendar.month_abbr)) if c != '' and c in pubSeason ] if len(monthSearch) > 0: pubMonth = monthMapping[monthSearch[0]] pubMonth_Field = pubdate.find("./month") if not pubMonth_Field is None: pubMonth = pubMonth_Field.text.strip().replace('\n',' ') pubDay_Field = pubdate.find("./day") if not pubDay_Field is None: pubDay = pubDay_Field.text.strip().replace('\n',' ') thisCompleteness = sum(not x is None for x in [pubYear,pubMonth,pubDay]) if thisCompleteness > completeness: mostComplete = pubYear,pubMonth,pubDay pubYear,pubMonth,pubDay = mostComplete journal = articleElem.findall('./front/journal-meta/journal-title') + articleElem.findall('./front/journal-meta/journal-title-group/journal-title') + articleElem.findall('./front-stub/journal-title-group/journal-title') assert len(journal) <= 1 journalText = " ".join(extractTextFromElemList(journal)) journalISOText = '' journalISO = articleElem.findall('./front/journal-meta/journal-id') + articleElem.findall('./front-stub/journal-id') for field in journalISO: if 'journal-id-type' in field.attrib and field.attrib['journal-id-type'] == "iso-abbrev": journalISOText = field.text return pmidText,pmcidText,doiText,pubYear,pubMonth,pubDay,journalText,journalISOText def getJournalDateForMedlineFile(elem,pmid): yearRegex = re.compile(r'(18|19|20)\d\d') monthMapping = {} for i,m in enumerate(calendar.month_name): monthMapping[m] = i for i,m in enumerate(calendar.month_abbr): monthMapping[m] = i # Try to extract the publication date pubDateField = elem.find('./MedlineCitation/Article/Journal/JournalIssue/PubDate') medlineDateField = elem.find('./MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate') assert not pubDateField is None, "Couldn't find PubDate field for PMID=%s" % pmid medlineDateField = pubDateField.find('./MedlineDate') pubDateField_Year = pubDateField.find('./Year') pubDateField_Month = pubDateField.find('./Month') pubDateField_Day = pubDateField.find('./Day') pubYear,pubMonth,pubDay = None,None,None if not medlineDateField is None: regexSearch = re.search(yearRegex,medlineDateField.text) if regexSearch: pubYear = regexSearch.group() monthSearch = [ c for c in (list(calendar.month_name) + list(calendar.month_abbr)) if c != '' and c in medlineDateField.text ] if len(monthSearch) > 0: pubMonth = monthSearch[0] else: if not pubDateField_Year is None: pubYear = pubDateField_Year.text if not pubDateField_Month is None: pubMonth = pubDateField_Month.text if not pubDateField_Day is None: pubDay = pubDateField_Day.text if not pubYear is None: pubYear = int(pubYear) if not (pubYear > 1700 and pubYear < 2100): pubYear = None if not pubMonth is None: if pubMonth in monthMapping: pubMonth = monthMapping[pubMonth] pubMonth = int(pubMonth) if not pubDay is None: pubDay = int(pubDay) return pubYear,pubMonth,pubDay def getPubmedEntryDate(elem,pmid): pubDateFields = elem.findall('./PubmedData/History/PubMedPubDate') allDates = {} for pubDateField in pubDateFields: assert 'PubStatus' in pubDateField.attrib #if 'PubStatus' in pubDateField.attrib and pubDateField.attrib['PubStatus'] == "pubmed": pubDateField_Year = pubDateField.find('./Year') pubDateField_Month = pubDateField.find('./Month') pubDateField_Day = pubDateField.find('./Day') pubYear = int(pubDateField_Year.text) pubMonth = int(pubDateField_Month.text) pubDay = int(pubDateField_Day.text) dateType = pubDateField.attrib['PubStatus'] if pubYear > 1700 and pubYear < 2100: allDates[dateType] = (pubYear,pubMonth,pubDay) if len(allDates) == 0: return None,None,None if 'pubmed' in allDates: pubYear,pubMonth,pubDay = allDates['pubmed'] elif 'entrez' in allDates: pubYear,pubMonth,pubDay = allDates['entrez'] elif 'medline' in allDates: pubYear,pubMonth,pubDay = allDates['medline'] else: pubYear,pubMonth,pubDay = list(allDates.values())[0] return pubYear,pubMonth,pubDay def processMedlineFile(pubmedFile): for event, elem in etree.iterparse(pubmedFile, events=('start', 'end', 'start-ns', 'end-ns')): if (event=='end' and elem.tag=='PubmedArticle'): #MedlineCitation'): # Try to extract the pmidID pmidField = elem.find('./MedlineCitation/PMID') assert not pmidField is None pmid = pmidField.text journalYear,journalMonth,journalDay = getJournalDateForMedlineFile(elem,pmid) entryYear,entryMonth,entryDay = getPubmedEntryDate(elem,pmid) jComparison = tuple ( 9999 if d is None else d for d in [ journalYear,journalMonth,journalDay ] ) eComparison = tuple ( 9999 if d is None else d for d in [ entryYear,entryMonth,entryDay ] ) if jComparison < eComparison: # The PubMed entry has been delayed for some reason so let's try the journal data pubYear,pubMonth,pubDay = journalYear,journalMonth,journalDay else: pubYear,pubMonth,pubDay = entryYear,entryMonth,entryDay # Extract the authors authorElems = elem.findall('./MedlineCitation/Article/AuthorList/Author') authors = [] for authorElem in authorElems: forename = authorElem.find('./ForeName') lastname = authorElem.find('./LastName') collectivename = authorElem.find('./CollectiveName') name = None if forename is not None and lastname is not None and forename.text is not None and lastname.text is not None: name = "%s %s" % (forename.text, lastname.text) elif lastname is not None and lastname.text is not None: name = lastname.text elif forename is not None and forename.text is not None: name = forename.text elif collectivename is not None and collectivename.text is not None: name = collectivename.text else: raise RuntimeError("Unable to find authors in Pubmed citation (PMID=%s)" % pmid) authors.append(name) chemicals = [] chemicalElems = elem.findall('./MedlineCitation/ChemicalList/Chemical/NameOfSubstance') for chemicalElem in chemicalElems: chemID = chemicalElem.attrib['UI'] name = chemicalElem.text #chemicals.append((chemID,name)) chemicals.append("%s|%s" % (chemID,name)) chemicalsTxt = "\t".join(chemicals) meshHeadings = [] meshElems = elem.findall('./MedlineCitation/MeshHeadingList/MeshHeading') for meshElem in meshElems: descriptorElem = meshElem.find('./DescriptorName') meshID = descriptorElem.attrib['UI'] majorTopicYN = descriptorElem.attrib['MajorTopicYN'] name = descriptorElem.text #meshHeading = {'Descriptor':name,'MajorTopicYN':majorTopicYN,'ID':meshID,'Qualifiers':[]} meshHeading = "Qualifier|%s|%s|%s" % (meshID,majorTopicYN,name) qualifierElems = meshElem.findall('./QualifierName') for qualifierElem in qualifierElems: meshID = qualifierElem.attrib['UI'] majorTopicYN = qualifierElem.attrib['MajorTopicYN'] name = qualifierElem.text qualifier = {'Descriptor':name,'MajorTopicYN':majorTopicYN,'ID':meshID} #meshHeading['Qualifiers'].append(qualifier) meshHeading += "%%Descriptor|%s|%s|%s" % (meshID,majorTopicYN,name) meshHeadings.append(meshHeading) meshHeadingsTxt = "\t".join(meshHeadings) # Extract the title of paper title = elem.findall('./MedlineCitation/Article/ArticleTitle') titleText = extractTextFromElemList(title) titleText = [ removeWeirdBracketsFromOldTitles(t) for t in titleText ] titleText = [ t for t in titleText if len(t) > 0 ] titleText = [ htmlUnescape(t) for t in titleText ] titleText = [ removeBracketsWithoutWords(t) for t in titleText ] # Extract the abstract from the paper abstract = elem.findall('./MedlineCitation/Article/Abstract/AbstractText') abstractText = extractTextFromElemList(abstract) abstractText = [ t for t in abstractText if len(t) > 0 ] abstractText = [ htmlUnescape(t) for t in abstractText ] abstractText = [ removeBracketsWithoutWords(t) for t in abstractText ] journalTitleFields = elem.findall('./MedlineCitation/Article/Journal/Title') journalTitleISOFields = elem.findall('./MedlineCitation/Article/Journal/ISOAbbreviation') journalTitle = " ".join(extractTextFromElemList(journalTitleFields)) journalISOTitle = " ".join(extractTextFromElemList(journalTitleISOFields)) document = {} document["pmid"] = pmid document["pubYear"] = pubYear document["pubMonth"] = pubMonth document["pubDay"] = pubDay document["title"] = titleText document["abstract"] = abstractText document["journal"] = journalTitle document["journalISO"] = journalISOTitle document["authors"] = authors document["chemicals"] = chemicalsTxt document["meshHeadings"] = meshHeadingsTxt yield document # Important: clear the current element from memory to keep memory usage low elem.clear() def processPMCFile(pmcFile): with open(pmcFile, 'r') as openfile: # Skip to the article element in the file for event, elem in etree.iterparse(openfile, events=('start', 'end', 'start-ns', 'end-ns')): if (event=='end' and elem.tag=='article'): pmidText,pmcidText,doiText,pubYear,pubMonth,pubDay,journal,journalISO = getMetaInfoForPMCArticle(elem) # We're going to process the main article along with any subarticles # And if any of the subarticles have distinguishing IDs (e.g. PMID), then # that'll be used, otherwise the parent article IDs will be used subarticles = [elem] + elem.findall('./sub-article') for articleElem in subarticles: if articleElem == elem: # This is the main parent article. Just use its IDs subPmidText,subPmcidText,subDoiText,subPubYear,subPubMonth,subPubDay,subJournal,subJournalISO = pmidText,pmcidText,doiText,pubYear,pubMonth,pubDay,journal,journalISO else: # Check if this subarticle has any distinguishing IDs and use them instead subPmidText,subPmcidText,subDoiText,subPubYear,subPubMonth,subPubDay,subJournal,subJournalISO = getMetaInfoForPMCArticle(articleElem) if subPmidText=='' and subPmcidText == '' and subDoiText == '': subPmidText,subPmcidText,subDoiText = pmidText,pmcidText,doiText if subPubYear == None: subPubYear = pubYear subPubMonth = pubMonth subPubDay = pubDay if subJournal == None: subJournal = journal subJournalISO = journalISO # Extract the title of paper title = articleElem.findall('./front/article-meta/title-group/article-title') + articleElem.findall('./front-stub/title-group/article-title') assert len(title) <= 1 titleText = extractTextFromElemList(title) titleText = [ removeWeirdBracketsFromOldTitles(t) for t in titleText ] # Get the subtitle (if it's there) subtitle = articleElem.findall('./front/article-meta/title-group/subtitle') + articleElem.findall('./front-stub/title-group/subtitle') subtitleText = extractTextFromElemList(subtitle) subtitleText = [ removeWeirdBracketsFromOldTitles(t) for t in subtitleText ] # Extract the abstract from the paper abstract = articleElem.findall('./front/article-meta/abstract') + articleElem.findall('./front-stub/abstract') abstractText = extractTextFromElemList(abstract) # Extract the full text from the paper as well as supplementaries and floating blocks of text articleText = extractTextFromElemList(articleElem.findall('./body')) backText = extractTextFromElemList(articleElem.findall('./back')) floatingText = extractTextFromElemList(articleElem.findall('./floats-group')) document = {'pmid':subPmidText, 'pmcid':subPmcidText, 'doi':subDoiText, 'pubYear':subPubYear, 'pubMonth':subPubMonth, 'pubDay':subPubDay, 'journal':subJournal, 'journalISO':subJournalISO} textSources = {} textSources['title'] = titleText textSources['subtitle'] = subtitleText textSources['abstract'] = abstractText textSources['article'] = articleText textSources['back'] = backText textSources['floating'] = floatingText for k in textSources.keys(): tmp = textSources[k] tmp = [ t for t in tmp if len(t) > 0 ] tmp = [ htmlUnescape(t) for t in tmp ] tmp = [ removeBracketsWithoutWords(t) for t in tmp ] textSources[k] = tmp document['textSources'] = textSources yield document # Less important here (compared to abstracts) as each article file is not too big elem.clear() def trimSentenceLengths(text): MAXLENGTH = 90000 return ".".join( line[:MAXLENGTH] for line in text.split('.') ) def writeMarcXMLRecordToBiocFile(record,biocWriter): metadata = record['008'].value() language = metadata[35:38] if language != 'eng': return recordid = record['001'].value() title = record.title() textSources = [title] abstract = None if '520' in record and 'a' in record['520']: abstract = record['520']['a'] textSources.append(abstract) #print recordid, language, title, abstract biocDoc = bioc.BioCDocument() biocDoc.id = recordid offset = 0 for textSource in textSources: if isinstance(textSource,six.string_types): textSource = trimSentenceLengths(textSource) passage = bioc.BioCPassage() passage.text = textSource passage.offset = offset offset += len(textSource) biocDoc.add_passage(passage) biocWriter.write_document(biocDoc) def uimaxmi2bioc(xmiFilename, biocFilename): tree = etree.parse(xmiFilename) root = tree.getroot() metadataNode = root.find('{http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore}DocumentMetaData') documentTitle = metadataNode.attrib['documentTitle'] contentNode = root.find('{http:///uima/cas.ecore}Sofa') content = contentNode.attrib['sofaString'] with bioc.BioCXMLDocumentWriter(biocFilename) as writer: biocDoc = bioc.BioCDocument() biocDoc.id = None biocDoc.infons['title'] = documentTitle passage = bioc.BioCPassage() passage.infons['section'] = 'article' passage.text = content passage.offset = 0 biocDoc.add_passage(passage) writer.write_document(biocDoc) def pubmedxml2bioc(pubmedxmlFilename, biocFilename): with bioc.BioCXMLDocumentWriter(biocFilename) as writer: for pmDoc in processMedlineFile(pubmedxmlFilename): biocDoc = bioc.BioCDocument() biocDoc.id = pmDoc["pmid"] biocDoc.infons['title'] = " ".join(pmDoc["title"]) biocDoc.infons['pmid'] = pmDoc["pmid"] biocDoc.infons['year'] = pmDoc["pubYear"] biocDoc.infons['month'] = pmDoc["pubMonth"] biocDoc.infons['day'] = pmDoc["pubDay"] biocDoc.infons['journal'] = pmDoc["journal"] biocDoc.infons['journalISO'] = pmDoc["journalISO"] biocDoc.infons['authors'] = ", ".join(pmDoc["authors"]) biocDoc.infons['chemicals'] = pmDoc['chemicals'] biocDoc.infons['meshHeadings'] = pmDoc['meshHeadings'] offset = 0 for section in ["title","abstract"]: for textSource in pmDoc[section]: textSource = trimSentenceLengths(textSource) passage = bioc.BioCPassage() passage.infons['section'] = section passage.text = textSource passage.offset = offset offset += len(textSource) biocDoc.add_passage(passage) writer.write_document(biocDoc) allowedSubsections = {"abbreviations","additional information","analysis","author contributions","authors' contributions","authors’ contributions","background","case report","competing interests","conclusion","conclusions","conflict of interest","conflicts of interest","consent","data analysis","data collection","discussion","ethics statement","funding","introduction","limitations","material and methods","materials","materials and methods","measures","method","methods","participants","patients and methods","pre-publication history","related literature","results","results and discussion","statistical analyses","statistical analysis","statistical methods","statistics","study design","summary","supplementary data","supplementary information","supplementary material","supporting information"} def pmcxml2bioc(pmcxmlFilename, biocFilename): try: with bioc.BioCXMLDocumentWriter(biocFilename) as writer: for pmcDoc in processPMCFile(pmcxmlFilename): biocDoc = bioc.BioCDocument() biocDoc.id = pmcDoc["pmid"] biocDoc.infons['title'] = " ".join(pmcDoc["textSources"]["title"]) biocDoc.infons['pmid'] = pmcDoc["pmid"] biocDoc.infons['pmcid'] = pmcDoc["pmcid"] biocDoc.infons['doi'] = pmcDoc["doi"] biocDoc.infons['year'] = pmcDoc["pubYear"] biocDoc.infons['month'] = pmcDoc["pubMonth"] biocDoc.infons['day'] = pmcDoc["pubDay"] biocDoc.infons['journal'] = pmcDoc["journal"] biocDoc.infons['journalISO'] = pmcDoc["journalISO"] offset = 0 for groupName,textSourceGroup in pmcDoc["textSources"].items(): subsection = None for textSource in textSourceGroup: textSource = trimSentenceLengths(textSource) passage = bioc.BioCPassage() subsectionCheck = textSource.lower().strip('01234567890. ') if subsectionCheck in allowedSubsections: subsection = subsectionCheck passage.infons['section'] = groupName passage.infons['subsection'] = subsection passage.text = textSource passage.offset = offset offset += len(textSource) biocDoc.add_passage(passage) writer.write_document(biocDoc) except etree.ParseError: raise RuntimeError("Parsing error in PMC xml file: %s" % pmcxmlFilename) def mergeBioc(biocFilename, outBiocWriter,idFilter): with bioc.BioCXMLDocumentReader(biocFilename) as parser: for biocDoc in parser: if idFilter is None or biocDoc.id in idFilter: outBiocWriter.write_document(biocDoc) def bioc2txt(biocFilename, txtHandle,idFilter): with bioc.BioCXMLDocumentReader(biocFilename) as parser: for biocDoc in parser: if idFilter is None or biocDoc.id in idFilter: for passage in biocDoc.passages: txtHandle.write(passage.text) txtHandle.write("\n\n") def marcxml2bioc(marcxmlFilename,biocFilename): with open(marcxmlFilename,'rb') as inF, bioc.BioCXMLDocumentWriter(biocFilename) as writer: def marcxml2bioc_helper(record): writeMarcXMLRecordToBiocFile(record,writer) pymarc.map_xml(marcxml2bioc_helper,inF) def convertFilesFromFilelist(listFile,inFormat,outFile,outFormat,idFilterListfile=None): with open(listFile) as f: inFiles = json.load(f) idFilterfiles = None if idFilterListfile: with open(idFilterListfile) as f: idFilterfiles = json.load(f) convertFiles(inFiles,inFormat,outFile,outFormat,idFilterfiles) acceptedInFormats = ['bioc','pubmedxml','marcxml','pmcxml','uimaxmi'] acceptedOutFormats = ['bioc','txt'] def convertFiles(inFiles,inFormat,outFile,outFormat,idFilterfiles=None): outBiocHandle,outTxtHandle = None,None if outFormat == 'bioc': outBiocHandle = bioc.BioCXMLDocumentWriter(outFile) elif outFormat == 'txt': outTxtHandle = codecs.open(outFile,'w','utf-8') if idFilterfiles is None: idFilterfiles = [ None for _ in inFiles ] print("Converting %d files to %s" % (len(inFiles),outFile)) for inFile,idFilterfile in zip(inFiles,idFilterfiles): if idFilterfile is None: idFilter = None else: with open(idFilterfile) as f: idFilter = set([ line.strip() for line in f ]) with tempfile.NamedTemporaryFile() as temp: if inFormat == 'bioc': shutil.copyfile(inFile,temp.name) elif inFormat == 'pubmedxml': pubmedxml2bioc(inFile,temp.name) elif inFormat == 'marcxml': marcxml2bioc(inFile,temp.name) elif inFormat == 'pmcxml': pmcxml2bioc(inFile,temp.name) elif inFormat == 'uimaxmi': uimaxmi2bioc(inFile,temp.name) else: raise RuntimeError("Unknown input format: %s" % inFormat) if outFormat == 'bioc': mergeBioc(temp.name,outBiocHandle,idFilter) elif outFormat == 'txt': bioc2txt(temp.name,outTxtHandle,idFilter) else: raise RuntimeError("Unknown output format: %s" % outFormat) print("Output to %s complete" % outFile) def main(): parser = argparse.ArgumentParser(description='Tool to convert corpus between different formats') parser.add_argument('--i',type=str,required=True,help="Comma-delimited list of documents to convert") parser.add_argument('--iFormat',type=str,required=True,help="Format of input corpus. Options: %s" % "/".join(acceptedInFormats)) parser.add_argument('--idFilters',type=str,help="Optional set of ID files to filter the documents by") parser.add_argument('--o',type=str,required=True,help="Where to store resulting converted docs") parser.add_argument('--oFormat',type=str,required=True,help="Format for output corpus. Options: %s" % "/".join(acceptedOutFormats)) args = parser.parse_args() inFormat = args.iFormat.lower() outFormat = args.oFormat.lower() assert inFormat in acceptedInFormats, "%s is not an accepted input format. Options are: %s" % (inFormat, "/".join(acceptedInFormats)) assert outFormat in acceptedOutFormats, "%s is not an accepted output format. Options are: %s" % (outFormat, "/".join(acceptedOutFormats)) inFiles = args.i.split(',') if args.idFilters: idFilterfiles = args.idFilters.split(',') assert len(inFiles) == len(idFilterfiles), "There must be the same number of input files as idFilters files" else: idFilterfiles = None convertFiles(inFiles,inFormat,args.o,outFormat,idFilterfiles)