# Python source code of subset

#!/usr/bin/python
# -*- coding: utf-8 -*-

""" 
**************************************************
conversion.py
Author: Damion Dooley

This script generates SLIM files for indexed Langual food databases.
An issue is that often the 3rd party databases have their own ID scheme.
To reuse LanguaL updates we would need to translate existing db food term ids
over to persistent LanguaL ids.  This works for a few databases of [alpha][digits] format.

The script loads the database.json and langual_facet_a.json database in order
to map each indexed food term over into an equivalency conjunction logic statement.

Subsets are written to the current directory.  To use them in a FoodOn install,
move them up a level and import them into your instance of FoodOn via Protege, or manually
by editing the top of foodon-edit.owl or foodon.owl.

INPUT
    ./template_import_header.txt
    ../langual/database.json
    ../langual/langual_facet_a.json
    ./[ontology name].TXT # A tab-delimited data file

OUTPUT
    ./[ontology name]_import.owl.txt

    Output has .txt on end to ensure that Protege doesn't reference it.  
    All references should just be to files in the /imports folder directly,
    so if using it, move output file up to /imports and rename.

**************************************************
"""
import json
import optparse
import sys
import os.path
import codecs
import re
import time
import requests

try: #Python 2.7
    from collections import OrderedDict
except ImportError: # Python 2.6
    from ordereddict import OrderedDict

#FOR LOADING JSON AND PRESERVING ORDERED DICT SORTING. 
try:    
    import simplejson as json
except ImportError: # Python 2.6
    import json


# Version string of this script (not referenced elsewhere in this file).
CODE_VERSION = '0.0.1'

def stop_err( msg, exit_code=1 ):
    """
    Report a fatal error and terminate the process.

    msg is written to stderr followed by a newline, then SystemExit is
    raised with exit_code (1 unless the caller says otherwise).
    """
    error_line = "%s\n" % msg
    sys.stderr.write(error_line)
    raise SystemExit(exit_code)

class MyParser(optparse.OptionParser):
    """
    OptionParser whose epilog is shown exactly as supplied, so that
    pre-formatted help text keeps its own layout.
    """

    def format_epilog(self, formatter):
        """Return the stored epilog untouched; the formatter argument is not used."""
        verbatim_epilog = self.epilog
        return verbatim_epilog


class Langual(object):
    """
    Builds an OWL subset ("slim") file from a LanguaL-indexed food database export.

    On construction the LanguaL JSON databases are loaded into memory.  Calling
    __main__() then reads one tab-delimited export, renders each indexed food
    as an <owl:Class> whose equivalence axiom is the conjunction of its LanguaL
    facet terms, and saves the result next to this script.

    The code is written to run unchanged on Python 2.7 and Python 3.

    Attributes:
        database: merged LanguaL database; database['index'] maps a LanguaL id
            to its entity record.
        database_path / product_type_path: JSON files merged into `database`.
        ontology_name: name of the subset currently being generated.
        subsetIdStart: integer offset added to each native numeric id.
        counts: number of facet references rendered, keyed by facet letter.
        label_reverse_lookup: lower-cased label -> entity, for non-deprecated
            food sources (facet B) and descendants of C0228.
    """

    # Facet letter (first character of a LanguaL id) -> relation used in the
    # owl:Restriction for that facet.  Facet A is handled separately as a
    # subClassOf link; letters D, I, L and O don't exist in LanguaL.
    _FACET_RELATIONS = {
        'B': '&obo;RO_0009005',   # FOOD SOURCE [B1564]: has primary substance added (awaiting RO relation)
        'C': '&obo;RO_0001000',   # PART OF PLANT OR ANIMAL [C0116]: derives from
        'E': '&obo;RO_0000086',   # PHYSICAL STATE, SHAPE OR FORM [E0113]: has quality
        'F': '&obo;RO_0000086',   # EXTENT OF HEAT TREATMENT [F0011]: has quality
        'G': '&obo;RO_0002354',   # COOKING METHOD [G0002]: formed as a result of
        'H': '&obo;RO_0002354',   # TREATMENT APPLIED [H0111]: formed as a result of (see "... added" exception)
        'J': '&obo;RO_0002354',   # PRESERVATION METHOD [J0107]: formed as a result of
        'K': '&obo;RO_0009003',   # PACKING MEDIUM [K0020]: immersed in
        'M': '&obo;PATO_0005016', # CONTAINER OR WRAPPING [M0100]: surrounded by / RO_0002002 has 2D boundary
        'N': '&obo;RO_0002220',   # FOOD CONTACT SURFACE [N0010]: adjacent to (at some point in time)
        'P': '&obo;RO_0009004',   # CONSUMER GROUP/DIETARY USE/LABEL CLAIM [P0032]: has consumer / RO_0000086 has quality
        'R': 'http://www.ebi.ac.uk/ancestro/ancestro_0308', # GEOGRAPHIC PLACES AND REGIONS [R0010]: has country of origin
        'Z': '&obo;RO_0000086',   # ADJUNCT CHARACTERISTICS OF FOOD [Z0005]: has quality
    }

    def __init__(self):
        """Initialise state and load the LanguaL databases from disk."""

        # READ THIS FROM database.json
        self.database = {}
        self.database_path = '../langual/database.json'
        self.product_type_path = '../langual/langual_facet_a.json'
        self.ontology_name = ''
        self.subsetIdStart = None
        self.counts = {}
        self.label_reverse_lookup = {}

        self.get_database_JSON()


    def __main__(self, subsetName, subsetInputFilePath, subsetIdStart, language):
        """
        Generate the subset file for one indexed food database.

        Args:
            subsetName: ontology/subset name; also used for the output file name.
            subsetInputFilePath: tab-delimited file with four columns per row:
                native id, label, alternate label, space-separated LanguaL ids.
            subsetIdStart: integer offset; slim entry ids start from here.
            language: language code attached to every label (e.g. 'en').

        Raises:
            ValueError: if a data row's native id contains no digits.
        """
        self.ontology_name = subsetName
        self.subsetIdStart = subsetIdStart # Slim entries start from here

        print("Generating subset " + self.ontology_name)

        # Universal-newline mode must be requested explicitly on Python 2; on
        # Python 3 it is the default and the 'U' flag was removed in 3.11.
        read_mode = 'rU' if sys.version_info[0] < 3 else 'r'

        # Collect rendered entries and join once, rather than growing one big
        # string inside the loop.
        rendered_entries = []

        with open(subsetInputFilePath, read_mode) as input_handle:
            for line in input_handle:
                # Lines in input are :Database native id[tab]label[tab]alt label[tab]LanguaL ids.
                params = line.strip().split('\t')
                if len(params) != 4 or params[0] == 'FOODID':
                    continue # malformed row or header row

                (importId, label, _altLabel, langualCodes) = params
                label = label.strip()
                langualCodes = langualCodes.strip()
                if not (importId and label and langualCodes):
                    continue

                entity = {
                    'import_id': importId,
                    'label': label.lower(),
                    'langual_ids': langualCodes.split(),
                    # Translates ids to FoodOn id range.
                    'id': self.get_new_subset_id(importId),
                    'language': language,
                }
                rendered_entries.append(self.subset_entry_render(entity))

        # Report the file that save_subset_owl() really writes.
        print("Saving ./" + self.ontology_name + "_import.owl.txt")

        self.save_subset_owl(''.join(rendered_entries))


    #************************************************************

    def _facet_relation(self, langual_id, label):
        """
        Return the relation for a facet reference, or None if it gets no restriction.

        Args:
            langual_id: LanguaL id of the referenced term.
            label: lower-cased label of the term as originally looked up.
        """
        category = langual_id[0]

        # Exception: a TREATMENT APPLIED term ending in " added" is a secondary
        # ingredient, so it is linked with "has substance added".
        if category == 'H' and label.endswith(' added'):
            return '&obo;RO_0009001'

        # K0003 is excluded from the packing-medium relation; P0024 ('human
        # consumer, no age specification') is handled through inheritance.
        if langual_id in ('K0003', 'P0024'):
            return None

        return self._FACET_RELATIONS.get(category)


    def subset_entry_render(self, entity):
        """
        Render one indexed food as an <owl:Class> element.

        Each LanguaL id of the entity is looked up in the database.  Facet A
        terms become rdfs:subClassOf links; other facets become
        owl:Restriction members of an equivalentClass intersection.  Ids that
        are unknown, marked 'ignore', or deprecated "no ..." / "... not known"
        / "...not applicable" placeholders are skipped.  self.counts is updated
        per facet letter for every id that is used.

        Args:
            entity: dict with keys 'id', 'import_id', 'label', 'langual_ids'
                and optionally 'language'.

        Returns:
            The RDF/XML text of the class.
        """

        prefix = '&obo;'
        index = self.database['index']

        parent_classes_rdf = ''
        facet_relations_rdf = ''

        for langual_id in entity['langual_ids']:

            # Lookup existing LanguaL entity if any.
            if langual_id not in index:
                print('Unable to find id "' + langual_id + '" in LanguaL database.')
                continue

            refEntity = index[langual_id]
            # We skip doing references to "ignore" items
            if refEntity['status'] == 'ignore':
                continue

            # If item is deprecated and is a "[food item] added" treatment whose
            # [food item] exists as a food source, refer to that food item instead.
            label = refEntity['label']['value'].lower()
            if refEntity['status'] == 'deprecated':
                base_label = label[0:-6]
                if refEntity['database_id'][0] == 'H' and label.endswith(' added') and base_label in self.label_reverse_lookup:
                    print("Replaced secondary ingredient with  %s" % base_label)
                    refEntity = self.label_reverse_lookup[base_label]
                # These are junky parts of conjunction
                elif label.startswith('no ') or label.endswith(' not known') or label.endswith('not applicable'):
                    continue

            # Stats on count of members of each LanguaL facet, which is first letter of entity id.
            category = langual_id[0]
            self.counts[category] = self.counts.get(category, 0) + 1

            # To do OWL links we have to refer to the entity's ontology id.
            ontology_id = refEntity['ontology_id']

            # A. PRODUCT TYPE [A0361]
            # A particular database/subset may place a product under one or more
            # Product Type Hierarchies, e.g. an US FDA one.
            if category == 'A':
                parent_classes_rdf += '\t<rdfs:subClassOf rdf:resource="%s%s"/>\n' % (prefix, ontology_id)

            # `label` is still the originally looked-up label, so a replaced
            # "... added" term keeps the "has substance added" relation.
            relation = self._facet_relation(langual_id, label)

            if relation:
                facet_relations_rdf += '''
                        <owl:Restriction>
                            <owl:onProperty rdf:resource="%s"/>
                            <owl:someValuesFrom rdf:resource="&obo;%s"/>
                        </owl:Restriction>
            ''' % (relation, ontology_id)

        # BEGIN <owl:Class>
        owl_output = '\n\n<owl:Class rdf:about="%s%s">\n' % (prefix, entity['id']) + parent_classes_rdf

        # Class Label.  '&' must be escaped first so that the entities produced
        # for '<' and '>' are not escaped a second time.
        label = entity['label'].replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
        labelLang = self.get_language_tag_owl(entity)

        # Make title look like "[food type] ([details])".
        title = label.split(',', 1)
        title[0] = title[0].lower()

        label = ''
        if len(title) > 1:
            label = ' (' + title[1].strip() + ')'
        # All langual indexed foods are food products.  Stating this here to make it distinct from
        # food source items that may have same name.
        elif 'en' in entity.get('language', '') and 'product' not in title[0]:
            title[0] = title[0] + ' (food product)'

        owl_output += '\t<rdfs:label %(language)s>%(label)s</rdfs:label>\n' % { 'label': title[0] + label, 'language': labelLang}
        # No definition (obo:IAO_0000115) is emitted for now.  Future: look up
        # as many as possible via wikipedia etc.

        # LanguaL import annotation
        owl_output += "\t<obo:IAO_0000412>http://langual.org</obo:IAO_0000412>\n"

        # Slim definition
        owl_output += "\t<oboInOwl:inSubset>%s</oboInOwl:inSubset>\n" % self.ontology_name

        # Curation status: slim entries are currently emitted as
        # 'requires discussion' (IAO_0000428).
        # Other possibility: 'ready for release' IAO_0000122
        owl_output += '\t<obo:IAO_0000114 rdf:resource="&obo;IAO_0000428"/>\n'

        owl_output += '\t<oboInOwl:hasDbXref>%s:%s</oboInOwl:hasDbXref>\n' % (self.ontology_name.upper(), entity['import_id'] )

        if len(facet_relations_rdf):
            owl_output += '''
    <owl:equivalentClass>
        <owl:Class>
            <owl:intersectionOf rdf:parseType="Collection">
                %s
            </owl:intersectionOf>
        </owl:Class>
    </owl:equivalentClass>
            ''' % (facet_relations_rdf)

        owl_output += '\n</owl:Class>'

        return owl_output


    def get_new_subset_id(self, id):
        """
        SLIM item id is mapped over to FOODON_ namespace such that subsequent
        loads of the SLIM items preserve same ids.

        The digits of the native id are read as one integer (leading zeros are
        fine, so 'F0000' maps to the offset itself) and added to subsetIdStart.

        Args:
            id: native database id, e.g. 'F1000' or 'CR0010'.

        Returns:
            'FOODON_' followed by the number zero-padded to 8 digits.

        Raises:
            ValueError: if the id contains no digits at all.
        """
        numericId = ''.join(ch for ch in id if ch.isdigit())
        if not numericId:
            raise ValueError('Cannot derive a FoodOn id: no digits in database id "%s"' % (id,))
        return 'FOODON_' + format(self.subsetIdStart + int(numericId), '08' ) # padded with 0 to 8 digits


    def save_subset_owl(self, owl_output_rdf):
        """
        Generate the [subset]_import.owl.txt ontology file in the current directory.

        The header is read from ./template_import_header.txt, its ONTOLOGY_NAME
        placeholder is replaced, and the rendered classes plus the closing
        </rdf:RDF> tag are appended.

        Args:
            owl_output_rdf: concatenated <owl:Class> elements.
        """
        # DON'T CALL THIS XYZ.owl - the Makefile make reads in subdirectories and will try to parse this, and fail.
        with open('./template_import_header.txt', 'r') as input_handle:
            owl_template = input_handle.read()

        # SUBSTITUTE ONTOLOGY NAME
        owl_document = owl_template.replace('ONTOLOGY_NAME', self.ontology_name + '_import')
        owl_document += owl_output_rdf
        owl_document += '</rdf:RDF>'

        with codecs.open('./' + self.ontology_name + '_import.owl.txt', 'w', 'utf-8') as output_handle:
            output_handle.write(owl_document)


    def get_language_tag(self, entity):
        """Return '@<language>' for the entity, or '' if it has no language."""
        if 'language' in entity:
            return '@' + entity['language']
        return ''


    def get_language_tag_owl(self, entity):
        """Return an xml:lang="..." attribute for the entity, or '' if it has no language."""
        if 'language' in entity:
            return 'xml:lang="' + entity['language'] + '"'
        return ''


    def get_database_JSON(self):
        """
        Load the JSON representation of the LanguaL database.

        The facet A (product type) file is merged into the main database index,
        the result is stored in self.database, and self.label_reverse_lookup is
        filled with every non-deprecated, non-ignored entity that is a food
        source (id starts with 'B') or a descendant of C0228.
        """

        with open(self.database_path) as data_file:
            dbObject = json.load(data_file, object_pairs_hook=OrderedDict)

        with open(self.product_type_path) as data_file:
            facetObject = json.load(data_file, object_pairs_hook=OrderedDict)

        dbObject['index'].update(facetObject['index'])

        # Must be set before the loop below: itemAncestor() reads self.database.
        self.database = dbObject

        # Create a reverse-lookup index by food source label, or extract, concentrate etc.
        for item, entity in dbObject['index'].items():
            # C0228 == extract, concentrate or isolate of plant or animal
            if item[0] == 'B' or self.itemAncestor(item, ['C0228']):
                if entity['status'] not in ('deprecated', 'ignore'):
                    self.label_reverse_lookup[entity['label']['value'].lower()] = entity


    def itemAncestor(self, item, ancestors):
        """
        Determine if item has an ancestor in the ancestors collection.

        Walks the 'is_a' links upwards from item.  Each id is visited at most
        once, so cycles and shared parents are harmless.  An entity without an
        'is_a' entry is treated as having no parents.

        Args:
            item: LanguaL id to start from (it is not itself tested).
            ancestors: collection of LanguaL ids to look for.

        Returns:
            True if any direct or indirect parent is in ancestors, else False.
        """
        index = self.database['index']
        pending = [item]
        seen = set(pending)
        while pending:
            langualID = pending.pop()
            if langualID not in index:
                continue
            parents = index[langualID].get('is_a') or {}
            for parent in parents:
                parentId = parents[parent]['value']
                if parentId in ancestors:
                    return True
                if parentId in index and parentId not in seen:
                    seen.add(parentId)
                    pending.append(parentId)
        return False


if __name__ == '__main__':

    # Generates Slim for given input file.
    # See http://www.langual.org/langual_indexed_datasets.asp for list of indexed food databases
    # A version of the SIREN food index has been done and moved to imports folder
    subset_name = 'subset_siren'
    input_path = './DBFSIREN.TXT'
    id_start = 3300000 #F1000 - F17788
    language = 'en'

    converter = Langual()
    converter.__main__(subset_name, input_path, id_start, language)

    # Main LanguaL import facet terms occupy FoodOn ids in range 3,400,000 -> 3,420,000

    # Other indexed databases and the id offsets reserved for them:
    #foodstruct.__main__('subset_caroteno', './CAROTENO.TXT', 3444000, 'en') # CR0010 - CR4162
    #foodstruct.__main__('subset_usda_sr8', './USDA Standard Reference 8.TXT', 3450000, 'en') # 1001 - 21140
    #foodstruct.__main__('subset_french', './FRENCH.TXT', 3500000, 'fr') # FR03010 - FR51572 (RECORD FR99999 REMOVED)

    # NOT DONE YET... id mapping issue.
    #foodstruct.__main__('subset_who', './WHO.TXT', 3300000, 'en') # ISSUE: some numeric ID's end in "A" to avoid duplicates
    #foodstruct.__main__('subset_codex', './CODEX.TXT', 3300000, 'en') # CX[A-163]-[...]