Python codecs.open() Examples

The following are 30 code examples of codecs.open(), taken from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the codecs module, or try the search function.
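Before the project-specific examples, here is a minimal sketch of codecs.open() itself, using a made-up file name: it opens a file with an explicit encoding and hands back a stream that reads and writes text, which is why it appears so often in code that has to run on both Python 2 and Python 3.

import codecs

# 'example.txt' is a hypothetical file used only for illustration.
with codecs.open('example.txt', 'w', 'utf-8') as writer:
    writer.write(u'hello, codecs\n')        # text is encoded to UTF-8 on write

with codecs.open('example.txt', 'r', 'utf-8') as reader:
    for line in reader:                     # each line is decoded from UTF-8
        print(line.strip())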
Example #1
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def txt2sentence(self, filename):
        """
        Read a <cut_file> and return its sentences as a list of lists,
        where each inner list holds the words of one sentence.
        """
        sentences = []
        try:
            fp = open(filename, 'r', encoding='utf-8')
            lines = fp.readlines()
        except UnicodeDecodeError:
            fp = open(filename, 'r', encoding='gbk')
            lines = fp.readlines()
        fp.close()

        for line in lines:
            line = line.strip()
            if len(line) <= 1:
                continue
            words = line.replace('\n', '').replace('\r', '').split(' ')
            sentences.append(words)
        return sentences 
Example #2
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def loadWordNet(self):
        """
        Load the Chinese WordNet (zh_wordnet) into the object.
        Collects the entries of the cow-not-full file into a lookup table.
        """
        f = codecs.open(self.wordnet_txt, "rb", "utf-8")
        self.known = dict()
        #self.known = set()
        for l in f:
            if l.startswith('\ufeff#') or not l.strip():
                continue
            row = l.strip().split("\t")
            (synset, lemma) = row
            #if len(row) == 2:
            #    (synset, lemma) = row
            #elif len(row) == 3:
            #    (synset, lemma, status) = row  # there are no three-column entries in practice
            #else:
            #    print("illformed line: ", l.strip())
            #if not (synset.strip(), lemma.strip()) in self.known:
            #    self.known.add((synset.strip(), lemma.strip()))
            if lemma.strip() not in self.known:
                self.known[lemma.strip()] = []
            self.known[lemma.strip()].append(synset) 
Example #3
Source File: preprocess.py    From open-sesame with Apache License 2.0
def write_to_conll(outf, fsp, firstex, sentid):
    mode = "a"
    if firstex:
        mode = "w"

    with codecs.open(outf, mode, "utf-8") as out:
        for i in xrange(fsp.sent.size()):
            token, postag, nltkpostag, nltklemma, lu, frm, role = fsp.info_at_idx(i)

            out.write(str(i+1) + "\t") # ID = 0
            out.write(token + "\t") # FORM = 1 (the codecs stream handles UTF-8 encoding)
            out.write("_\t" + nltklemma + "\t") # LEMMA PLEMMA = 2,3
            out.write(postag + "\t" + nltkpostag + "\t") # POS PPOS = 4,5
            out.write(str(sentid-1) + "\t_\t") # FEAT PFEAT = 6,7 ~ replacing FEAT with sentence number
            out.write("_\t_\t") # HEAD PHEAD = 8,9
            out.write("_\t_\t") # DEPREL PDEPREL = 10,11
            out.write(lu + "\t" + frm + "\t") # FILLPRED PRED = 12,13
            out.write(role + "\n") # APREDS = 14

        out.write("\n") # end of sentence 
Example #4
Source File: utils.py    From Att-ChemdNER with Apache License 2.0
def get_perf(filename):
    ''' Run the conlleval.pl perl script to obtain
    precision/recall and the F1 score. '''
    _conlleval = PREFIX + 'conlleval'
    if not isfile(_conlleval):
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl')
        os.system('wget -O ' + _conlleval +
                  ' https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl')
        chmod(_conlleval, stat.S_IRWXU) # give execute permission to the downloaded script

    out = []
    proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(open(filename).read())
    for line in stdout.split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break

    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']
    precision = float(out[3][:-2])
    recall    = float(out[5][:-2])
    f1score   = float(out[7])

    return {'p': precision, 'r': recall, 'f1': f1score} 
Example #5
Source File: trip-advisor-crawler.py    From trip-advisor-crawler with GNU General Public License v3.0
def getreview(domain, cityid, activity, reviewid, timeout, maxretries, basepath, force, pause):
    baseurl = 'http://www.tripadvisor.' + domain + '/ShowUserReviews-g'
    reviewurl = '%s%s-d%s-r%s' % (baseurl, cityid, activity, reviewid)

    path = os.sep.join((basepath, domain, str(cityid), str(activity)))
    filename = os.sep.join((path, str(reviewid) + '.html'))
    if force or not os.path.exists(filename):
        htmlpage = download_page(reviewurl, maxretries, timeout, pause)

        if htmlpage is None:
            print('Error downloading the review URL: ' + reviewurl)
        else:
            if not os.path.exists(path):
                os.makedirs(path)

            with codecs.open(filename, mode='w', encoding='utf8') as file:
                file.write(htmlpage.decode('utf-8')) 
Example #6
Source File: model.py    From Att-ChemdNER with Apache License 2.0
def save_mappings(self, id_to_word, id_to_char, id_to_tag):
#{{{
        """
        We need to save the mappings if we want to use the model later.
        """
        self.id_to_word = id_to_word
        self.id_to_char = id_to_char
        self.id_to_tag = id_to_tag
        with open(self.mappings_path, 'wb') as f:
            mappings = {
                'id_to_word': self.id_to_word,
                'id_to_char': self.id_to_char,
                'id_to_tag': self.id_to_tag,
            }
            cPickle.dump(mappings, f)
#}}} 
Example #7
Source File: bpe.py    From keras-gpt-2 with MIT License
def get_bpe_from_files(encoder_path, vocab_path):
    """Get an initialized BPE object.

    :param encoder_path: Path to 'encoder.json'.
    :param vocab_path: Path to 'vocab.bpe'.
    :return: The object used to encode and decode strings.
    """
    with codecs.open(encoder_path, 'r', 'utf8') as reader:
        token_dict = json.load(reader)
    bpe_rank = {}
    with codecs.open(vocab_path, 'r', 'utf8') as reader:
        reader.readline()
        for rank, line in enumerate(reader):
            line = line.strip()
            if line:
                bpe_rank[tuple(line.split())] = rank
    return BytePairEncoding(token_dict, bpe_rank) 
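A hypothetical call site for the loader above; the file paths and the encode/decode method names on the returned BytePairEncoding object are assumptions for illustration, not taken from the keras-gpt-2 source:

# Placeholder paths for the released GPT-2 vocabulary files.
bpe = get_bpe_from_files('models/117M/encoder.json', 'models/117M/vocab.bpe')
token_ids = bpe.encode('some input text')   # assumed method name
print(bpe.decode(token_ids))                # assumed method name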
Example #8
Source File: utils.py    From DOTA_models with Apache License 2.0
def bodpolyToRec(srcpath, dstpath):
    #dstpath = os.path.join(r'E:\bod-dataset\patches\subcategorylabel\results\ReclabelTxt')
    filelist = GetFileFromThisRootDir(srcpath)
    namelist = [mybasename(x.strip()) for x in filelist]
    for basename in namelist:
        #objects = parse_bod_poly(os.path.join(self.labelpath, basename + '.txt'))
        objects = parse_bod_poly(os.path.join(srcpath, basename + '.txt'))
        with codecs.open(os.path.join(dstpath, basename + '.txt'), 'w', 'utf_16') as f_out:
            for obj in objects:
                bbox = dots4ToRec8(obj['poly'])
                name = obj['name']
                difficult = obj['difficult']
                bbox = list(map(str, bbox))
                outline = ' '.join(bbox)
                outline = outline + ' ' + name
                if difficult:
                    outline = outline + ' ' + str(difficult)
                f_out.write(outline + '\n') 
Example #9
Source File: loader.py    From Att-ChemdNER with Apache License 2.0
def load_sentences(path, lower, zeros):
#{{{
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
#}}} 
Example #10
Source File: load_base.py    From lyrebird-api-coverage with MIT License
def auto_load_base():
    lyrebird_conf = lyrebird.context.application.conf
    # Read the base file given in the config and copy it into base.json
    if lyrebird_conf.get('hunter.base'):
        base_path = lyrebird_conf.get('hunter.base')
        base = codecs.open(base_path, 'r', 'utf-8').read()
        f = codecs.open(DEFAULT_BASE, 'w', 'utf-8')
        f.write(base)
        f.close()
        app_context.base_sha1 = get_file_sha1(DEFAULT_BASE)
        return json.loads(base)
    # Otherwise fall back to the local default base file
    elif not os.path.exists(DEFAULT_BASE):
        copy_file(DEFAULT_BASE)
    with codecs.open(DEFAULT_BASE, 'r', 'utf-8') as f:
        json_obj = json.load(f)
        app_context.base_sha1 = get_file_sha1(DEFAULT_BASE)
        return json_obj 
Example #11
Source File: utils.py    From DOTA_models with Apache License 2.0
def wordlabel2dark(self):
        filelist = GetFileFromThisRootDir(self.wordlabelpath)
        #print(filelist)
        for fullname in filelist:
            objects = parse_bod_poly(fullname)
            name = mybasename(fullname)
            with open(os.path.join(self.darkpath, name + '.txt'), 'w') as f_out:
                for obj in objects:
                    poly = obj['poly']
                    bbox = np.array(dots4ToRecC(poly)) / 1024
                    ## note: the box is x_center, y_center, w, h, that means the whole box can be out of border
                    if (str(obj['difficult']) == '1'):
                        continue
                    if (sum(bbox <= 0) + sum(bbox >= 1)) >= 1:
                        continue
                    if (obj['name'] in wordname_15):
                        id = wordname_15.index(obj['name'])
                    else:
                        continue
                    outline = str(id) + ' ' + ' '.join(list(map(str, bbox)))
                    f_out.write(outline + '\n') 
Example #12
Source File: utils.py    From DOTA_models with Apache License 2.0
def TransTo15Word_gt(self):
        dstpath = r'wordlabel'
        self.ParseTxtAndWrite(self.labelpath, dstpath, datamap_15)
    # def TransTo15class(self, path):
    #     filelist = GetFileFromThisRootDir(self.labelpath)
    #     for fullname in filelist:
    #         objects = parse_bod_poly2(fullname)
    #         name = mybasename(fullname)
    #         outname = os.path.join(self.basepath, path, name + '.txt')
    #         f_out = codecs.open(outname, 'w', 'utf_16')
    #
    #         for obj in objects:
    #             if obj['name'] in classname_15:
    #                 if path == 'wordlabel':
    #                     outline = ' '.join(map(str, obj['poly'])) + ' ' + datamap_15[obj['name']] + ' ' + str(obj['difficult'])
    #                     print('outline:', outline)
    #                     #f_out.write(outline + '\n')
    #                 elif path == 'label15Txt':
    #                     outline = ' '.join(map(str, obj['poly'])) + ' ' + obj['name'] + ' ' + str(obj['difficult'])
    #                     print('outline:', outline)
    #                     f_out.write(outline + '\n') 
Example #13
Source File: utils.py    From DOTA_models with Apache License 2.0
def bodpolyToRec(self, label):
        Recpath = os.path.join(self.basepath, r'ReclabelTxt')
        for basename in self.namelist:
            #objects = parse_bod_poly(os.path.join(self.labelpath, basename + '.txt'))
            objects = parse_bod_poly(os.path.join(self.basepath, label, basename + '.txt'))
            with codecs.open(os.path.join(Recpath, basename + '.txt'), 'w', 'utf_16') as f_out:
                for obj in objects:
                    bbox = dots4ToRec8(obj['poly'])
                    name = obj['name']
                    difficult = obj['difficult']
                    bbox = list(map(str, bbox))
                    outline = ' '.join(bbox)
                    outline = outline + ' ' + name
                    if difficult:
                        outline = outline + ' ' + str(difficult)
                    f_out.write(outline + '\n') 
Example #14
Source File: utils.py    From DOTA_models with Apache License 2.0
def bod2darknet(subpath, label, extractclassname):
    labelpath = os.path.join(subpath, label)
    filelist = GetFileFromThisRootDir(labelpath)
    outpath = r'/home/dj/data/bod-subset/labels'
    for fullname in filelist:
        objects = parse_bod_poly(fullname)
        name = os.path.splitext(os.path.basename(fullname))[0]
        with open(os.path.join(outpath, name + '.txt'), 'w') as f_out:
            for obj in objects:
                poly = obj['poly']
                bbox = np.array(dots4ToRecC(poly)) / 1024
                if (sum(bbox <= 0) + sum(bbox >= 1)) >= 1:
                    continue
                if (obj['name'] in extractclassname):
                    id = extractclassname.index(obj['name'])
                else:
                    continue
                outline = str(id) + ' ' + ' '.join(list(map(str, bbox)))
                f_out.write(outline + '\n') 
Example #15
Source File: Self.py    From CyberTK-Self with GNU General Public License v2.0
def sendVoice(self, to_, path):
        M = Message(to=to_, text=None, contentType = 3)
        M.contentPreview = None
        M_id = self._client.sendMessage(0,M).id
        files = {
            'file': open(path, 'rb'),
        }
        params = {
            'name': 'voice_message',
            'oid': M_id,
            'size': len(open(path, 'rb').read()),
            'type': 'audio',
            'ver': '1.0',
        }
        data = {
            'params': json.dumps(params)
        }
        r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
        if r.status_code != 201:
            raise Exception('Upload voice failure.')
        return True 
Example #16
Source File: setup.py    From L.E.S.M.A with Apache License 2.0
def find_version(*file_paths):
    # Open in Latin-1 so that we avoid encoding errors.
    # Use codecs.open for Python 2 compatibility
    try:
        f = codecs.open(os.path.join(here, *file_paths), 'r', 'latin1')
        version_file = f.read()
        f.close()
    except (IOError, OSError):
        raise RuntimeError("Unable to read the version file.")

    # The version line must have the form
    # __version__ = 'ver'
    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                              version_file, re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")


# Get the long description from the relevant file 
Example #17
Source File: Self.py    From CyberTK-Self with GNU General Public License v2.0
def sendAudio(self, to_, path):
        M = Message(to=to_, text=None, contentType = 3)
        M_id = self.Talk.client.sendMessage(0,M).id
        files = {
            'file': open(path, 'rb'),
        }
        params = {
            'name': 'media',
            'oid': M_id,
            'size': len(open(path, 'rb').read()),
            'type': 'audio',
            'ver': '1.0',
        }
        data = {
            'params': json.dumps(params)            
        }       

        r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
        print r
        if r.status_code != 201:
            raise Exception('Upload audio failure.') 
Example #18
Source File: formatters.py    From ciocheck with MIT License
def _add_missing_init_py(self, paths):
        """Add missing __init__.py files in the module subdirectories."""
        results = []
        folders = [os.path.dirname(p) for p in paths]

        # Avoid adding an init on repo level if setup.py or other script on the
        # top level has changed
        if self.cmd_root in folders:
            folders.remove(self.cmd_root)

        for folder in folders:
            init_py = os.path.join(folder, "__init__.py")
            exists = os.path.exists(init_py)
            if not exists:
                with codecs.open(init_py, 'w', 'utf-8') as handle:
                    handle.flush()
                result = {
                    'path': init_py,
                    'created': not exists,
                    'diff': diff('', ''),
                    'error': None,
                }
                results.append(result)
        return results 
Example #19
Source File: Self.py    From CyberTK-Self with GNU General Public License v2.0
def sendImage(self, to_, path):
      M = Message(to=to_,contentType = 1)
      M.contentMetadata = None
      M.contentPreview = None
      M_id = self.Talk.client.sendMessage(0,M).id
      files = {
         'file': open(path, 'rb'),
      }
      params = {
         'name': 'media',
         'oid': M_id,
         'size': len(open(path, 'rb').read()),
         'type': 'image',
         'ver': '1.0',
      }
      data = {
         'params': json.dumps(params)
      }
      r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
      if r.status_code != 201:
         raise Exception('Upload image failure.')
      return True 
Example #20
Source File: Segmentation.py    From text-rank with MIT License
def __init__(self, stop_words_file = None, allow_speech_tags = util.allow_speech_tags):
        """
        Keyword arguments:
        stop_words_file    -- path to the stop-word file (UTF-8 encoded, one stop word per line); if not a str, the default stop-word file is used
        allow_speech_tags  -- list of part-of-speech tags used for filtering
        """     
        
        allow_speech_tags = [util.as_text(item) for item in allow_speech_tags]

        self.default_speech_tag_filter = allow_speech_tags
        self.stop_words = set()
        self.stop_words_file = get_default_stop_words_file()
        if type(stop_words_file) is str:
            self.stop_words_file = stop_words_file
        for word in codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore'):
            self.stop_words.add(word.strip()) 
Example #21
Source File: data_helpers.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # download dataset
    get_chinese_text()

    # Load data from files
    positive_examples = list(codecs.open("./data/pos.txt", "r", "utf-8").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    positive_examples = [pe for pe in positive_examples if len(pe) < 100]
    negative_examples = list(codecs.open("./data/neg.txt", "r", "utf-8").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    negative_examples = [ne for ne in negative_examples if len(ne) < 100]
    # Split by words
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    x_text = [list(s) for s in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y] 
Example #22
Source File: preprocess.py    From open-sesame with Apache License 2.0
def filter_embeddings(embedding_files):
    """
    Filters the embeddings file to retain only the vocabulary in the train, dev and test files.
    """
    sys.stderr.write("\nReading FrameNet {} vocabulary...\n".format(VERSION))
    vocab = set([])
    corpora = [DEV_CONLL, TRAIN_FTE, TRAIN_EXEMPLAR, TEST_CONLL]
    for corpus in corpora:
        with codecs.open(corpus, "r", "utf-8") as cf:
            tokens = [line.split("\t")[1].lower() for line in cf  if line != "\n"]
            cf.close()
        vocab.update(tokens)
    sys.stderr.write("\nTotal (train + dev + test) vocabulary size = {}\nFiltering out the word vectors ...".format(len(vocab)))

    for emb_file in embedding_files:
        embeddings_file = open(DATA_DIR + emb_file, 'r')
        new_embeddings_file = DATA_DIR.split(".txt")[0] + VERSION + ".framevocab.txt"
        filtered_embeddings = open(new_embeddings_file, 'w')
        num_embeddings = 0
        for l in embeddings_file:
            fields = l.strip().split(' ')
            wd = fields[0].lower()
            if wd in vocab:
                filtered_embeddings.write(l)
                num_embeddings += 1
        embeddings_file.close()
        filtered_embeddings.close()
        sys.stderr.write("\nTotal embeddings in {} = {}\n".format(new_embeddings_file, num_embeddings)) 
Example #23
Source File: utils.py    From DOTA_models with Apache License 2.0
def GetListFromfile(fullname):
    with open(fullname, 'r') as f:
        lines = f.readlines()
        names = {x.strip() for x in lines}
    return names 
Example #24
Source File: setup.py    From tvdbsimple with GNU General Public License v3.0
def read(fname):

    here = path.join(path.abspath(path.dirname(__file__)), fname)
    txt = ''
    if (path.isfile(here)):
        # Get the long description from the README file
        with open(here, encoding='utf-8') as f:
            txt= f.read()
    return txt 
Example #25
Source File: preprocess_syntax.py    From open-sesame with Apache License 2.0
def join_fnconll_parseyconll(conllfile, synfile, outfile):
    with codecs.open(outfile, "w", "utf-8") as outf:
        with codecs.open(conllfile, "r", "utf-8") as cf:
            with codecs.open(synfile, "r", "utf-8") as sf:
                for l,sl in izip(cf,sf):

                    cfields = l.strip().split("\t")
                    if len(cfields) == 1:
                        outf.write("\n")
                        continue

                    if len(cfields) != 15:
                        raise Exception("incorrect CoNLL 2009 format", l, cfields)

                    sfields = sl.strip().split("\t")
                    if len(sfields) != 10:
                        raise Exception("incorrect parsey CoNLL format")

                    newfields = cfields[:4] # ID FORM LEMMA PLEMMA = 0,1,2,3
                    newfields += sfields[3:6:2] # syntaxnetPOS fnPOS = 4,5  ~ replacing POS PPOS
                    newfields += cfields[6:9] # sent_num PFEAT HEAD = 6,7,8 ~ replacing FEAT PFEAT HEAD
                    newfields += sfields[6:7] # syntaxnetHEAD = 9           ~ replacing PHEAD
                    newfields += cfields[10:11] # DEPREL = 10
                    newfields += sfields[7:8] # syntaxnetDEPREL = 11        ~ replacing PDEPREL
                    newfields += cfields[12:] # FILLPRED PRED APREDS = 12,13,14
                    if len(newfields) != len(cfields):
                        raise Exception("didn't join properly", len(newfields), len(cfields), newfields)
                    outf.write("\t".join(newfields) + "\n")
                sf.close()
            cf.close()
        outf.close() 
Example #26
Source File: data.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def from_file(filename):
        vocab = Vocabulary()
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                word, count = line.strip().split()
                vocab.add(word, int(count))
        vocab.finalize()
        return vocab 
Example #27
Source File: data.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def _parse_file(self, file_name):
        logging.debug("Processing file: %s" % file_name)
        with codecs.open(file_name, "r", "utf-8") as f:
            lines = [line.strip() for line in f]
            if not self._shuffle:
                random.shuffle(lines)
            logging.debug("Finished processing!")
            for line in lines:
                yield self._parse_sentence(line) 
Example #28
Source File: load_base.py    From lyrebird-api-coverage with MIT License
def get_file_sha1(path):
    with open(path, 'rb') as f:
        sha1obj = hashlib.sha1()
        sha1obj.update(f.read())
        hash_sha1 = sha1obj.hexdigest()
        return hash_sha1 
Example #29
Source File: fe_to_conll.py    From open-sesame with Apache License 2.0
def join_google_fe_test_conll(conllfile, frames, tfdict1, sent1, outfile):
    with codecs.open(outfile, "w", "utf-8") as outf:
        with codecs.open(conllfile, "r", "utf-8") as cf:
            for l in cf:
                cfields = l.strip().split("\t")
                if len(cfields) == 1:
                    outf.write("\n")
                    continue

                if len(cfields) != 15:
                    raise Exception("incorrect CoNLL 2009 format", l, cfields)
                newfields = cfields[:12]

                sent_num = int(cfields[6])
                position = int(cfields[0])-1
                if sent_num in frames and position in frames[sent_num] and cfields[12] != gc.EMPTY_LABEL:
                    newfields.append(cfields[12]) # keep our LUs but use their frames.
                    newfields.append(frames[sent_num][position][0])
                    newfields.append(gc.EMPTY_FE)
                elif sent_num == sent1 and position in tfdict1:
                    newfields.append(cfields[12]) # keep our LUs but use their frames.
                    newfields.append(tfdict1[position][0])
                    newfields.append(gc.EMPTY_FE)
                else:
                    newfields += [gc.EMPTY_LABEL, gc.EMPTY_LABEL, gc.EMPTY_FE] # FILLPRED PRED APREDS = 12,13,14
                if len(newfields) != len(cfields):
                    raise Exception("didn't join properly", len(newfields), len(cfields), newfields)
                outf.write("\t".join(newfields) + "\n")
            cf.close()
        outf.close() 
Example #30
Source File: data_helpers.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def load_pretrained_word2vec(infile):
    if isinstance(infile, str):
        infile = open(infile)

    word2vec = {}
    for idx, line in enumerate(infile):
        if idx == 0:
            vocab_size, dim = line.strip().split()  # header line: vocabulary size and vector dimension
        else:
            tks = line.strip().split()
            word2vec[tks[0]] = list(map(float, tks[1:]))

    return word2vec