Python codecs.open() Examples
The following are 30
code examples of codecs.open().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module codecs, or try the search function.
Example #1
Source File: NLP.py From Financial-NLP with Apache License 2.0 | 8 votes |
def txt2sentence(self, filename):
    """Read a <cut_file> and return its sentences.

    Each non-trivial line of the file is one sentence of space-separated
    words; returns a list of word lists.
    """
    # Try UTF-8 first and fall back to GBK for legacy Chinese files.
    # Narrowed from a bare except: only a decode failure should trigger
    # the fallback; missing files etc. propagate to the caller.
    try:
        with open(filename, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()
    except UnicodeDecodeError:
        with open(filename, 'r', encoding='gbk') as fp:
            lines = fp.readlines()
    sentences = []
    for line in lines:
        line = line.strip()
        if len(line) <= 1:
            # Skip blank / single-character lines: no usable sentence.
            continue
        sentences.append(line.replace('\n', '').replace('\r', '').split(' '))
    return sentences
Example #2
Source File: NLP.py From Financial-NLP with Apache License 2.0 | 7 votes |
def loadWordNet(self):
    """Load the Chinese WordNet data into ``self.known``.

    Parses the cow-not-full file at ``self.wordnet_txt``: every
    non-comment, non-blank line is "<synset>\t<lemma>". Builds a dict
    mapping each stripped lemma to the list of synsets it appears in.
    """
    self.known = dict()
    # 'with' guarantees the file is closed (the original leaked it).
    with codecs.open(self.wordnet_txt, "rb", "utf-8") as f:
        for l in f:
            # Skip the BOM/comment header and empty lines.
            if l.startswith('\ufeff#') or not l.strip():
                continue
            (synset, lemma) = l.strip().split("\t")
            lemma = lemma.strip()
            # Dead commented-out variants removed; the data file always
            # has exactly two tab-separated columns.
            if lemma not in self.known:
                self.known[lemma] = []
            self.known[lemma].append(synset)
Example #3
Source File: preprocess.py From open-sesame with Apache License 2.0 | 6 votes |
def write_to_conll(outf, fsp, firstex, sentid):
    """Write one frame-semantic parse to a CoNLL 2009 file.

    outf: output file path (truncated when firstex is true, appended
    otherwise); fsp: parse object exposing sent.size()/info_at_idx();
    sentid: 1-based sentence id stored (minus one) in the FEAT column.
    """
    mode = "w" if firstex else "a"
    # Bind the handle to a new name: the original reused 'outf' for both
    # the path and the handle, and also called close() inside the 'with'.
    with codecs.open(outf, mode, "utf-8") as out:
        for i in range(fsp.sent.size()):
            token, postag, nltkpostag, nltklemma, lu, frm, role = fsp.info_at_idx(i)
            out.write(str(i + 1) + "\t")                  # ID = 0
            # The codecs writer encodes on write; pre-encoding the token
            # (as the original did) mojibakes non-ASCII text.
            out.write(token + "\t")                       # FORM = 1
            out.write("_\t" + nltklemma + "\t")           # LEMMA PLEMMA = 2,3
            out.write(postag + "\t" + nltkpostag + "\t")  # POS PPOS = 4,5
            out.write(str(sentid - 1) + "\t_\t")          # FEAT PFEAT = 6,7 ~ FEAT holds sentence number
            out.write("_\t_\t")                           # HEAD PHEAD = 8,9
            out.write("_\t_\t")                           # DEPREL PDEPREL = 10,11
            out.write(lu + "\t" + frm + "\t")             # FILLPRED PRED = 12,13
            out.write(role + "\n")                        # APREDS = 14
        out.write("\n")  # end of sentence
Example #4
Source File: utils.py From Att-ChemdNER with Apache License 2.0 | 6 votes |
def get_perf(filename):
    ''' run conlleval.pl perl script to obtain precision/recall and F1 score '''
    _conlleval = PREFIX + 'conlleval'
    if not isfile(_conlleval):
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl')
        os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl')
        chmod('conlleval.pl', stat.S_IRWXU) # give the execute permissions
        # NOTE(review): wget saves the script as 'conlleval.pl' in the cwd,
        # but _conlleval is PREFIX + 'conlleval' -- confirm the downloaded
        # file actually lands at that path, otherwise the isfile() guard
        # re-downloads on every call and Popen may fail to find the script.
    out = []
    # Pipe the tagged file to conlleval over stdin (Python-2 str pipes;
    # on Python 3 communicate() would need bytes or text=True).
    proc = subprocess.Popen(["perl", _conlleval],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(open(filename).read())
    # conlleval prints an 'accuracy:' summary line; grab the first one.
    for line in stdout.split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break
    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']
    # Strip the trailing '%;' from precision/recall; FB1 has no suffix.
    precision = float(out[3][:-2])
    recall = float(out[5][:-2])
    f1score = float(out[7])
    return {'p':precision, 'r':recall, 'f1':f1score}
Example #5
Source File: trip-advisor-crawler.py From trip-advisor-crawler with GNU General Public License v3.0 | 6 votes |
def getreview(domain, cityid, activity, reviewid, timeout, maxretries, basepath, force, pause):
    """Download a single TripAdvisor review page and cache it on disk.

    The page is stored under basepath/domain/cityid/activity/reviewid.html.
    Existing files are left alone unless ``force`` is set; a failed
    download only prints an error message.
    """
    base_url = 'http://www.tripadvisor.' + domain + '/ShowUserReviews-g'
    review_url = '%s%s-d%s-r%s' % (base_url, cityid, activity, reviewid)
    directory = os.sep.join((basepath, domain, str(cityid), str(activity)))
    target = os.sep.join((directory, str(reviewid) + '.html'))
    if not force and os.path.exists(target):
        return  # already cached
    page = download_page(review_url, maxretries, timeout, pause)
    if page is None:
        print('Error downloading the review URL: ' + review_url)
        return
    if not os.path.exists(directory):
        os.makedirs(directory)
    with codecs.open(target, mode='w', encoding='utf8') as handle:
        handle.write(page.decode('utf-8'))
Example #6
Source File: model.py From Att-ChemdNER with Apache License 2.0 | 6 votes |
def save_mappings(self, id_to_word, id_to_char, id_to_tag):
    #{{{
    """Persist the word/char/tag id mappings required to reload the model."""
    self.id_to_word = id_to_word
    self.id_to_char = id_to_char
    self.id_to_tag = id_to_tag
    mappings = {
        'id_to_word': id_to_word,
        'id_to_char': id_to_char,
        'id_to_tag': id_to_tag,
    }
    with open(self.mappings_path, 'wb') as handle:
        cPickle.dump(mappings, handle)
    #}}}
Example #7
Source File: bpe.py From keras-gpt-2 with MIT License | 6 votes |
def get_bpe_from_files(encoder_path, vocab_path):
    """Get initialized BPE.

    :param encoder_path: Path to 'encoder.json'.
    :param vocab_path: Path to 'vocab.bpe'
    :return: The object from encode and decode strings.
    """
    with codecs.open(encoder_path, 'r', 'utf8') as reader:
        token_dict = json.load(reader)
    with codecs.open(vocab_path, 'r', 'utf8') as reader:
        # The first line of vocab.bpe is a version header -- drop it.
        merge_lines = reader.read().split('\n')[1:]
    # Rank each merge pair by its (0-based) position after the header;
    # blank lines are skipped but still consume a rank, as before.
    bpe_rank = {
        tuple(entry.split()): rank
        for rank, entry in enumerate(merge_lines)
        if entry.strip()
    }
    return BytePairEncoding(token_dict, bpe_rank)
Example #8
Source File: utils.py From DOTA_models with Apache License 2.0 | 6 votes |
def bodpolyToRec(srcpath, dstpath):
    """Convert polygon label files under ``srcpath`` to axis-aligned
    rectangle label files in ``dstpath`` (UTF-16 text).

    One output line per object: 8 rectangle coordinates, the class name,
    and the difficulty flag when it is truthy.
    """
    filelist = GetFileFromThisRootDir(srcpath)
    namelist = [mybasename(x.strip()) for x in filelist]
    for basename in namelist:
        objects = parse_bod_poly(os.path.join(srcpath, basename + '.txt'))
        # 'with' closes each output file promptly; the original opened a
        # new handle every iteration and never closed any of them.
        with codecs.open(os.path.join(dstpath, basename + '.txt'), 'w', 'utf_16') as f_out:
            for obj in objects:
                bbox = list(map(str, dots4ToRec8(obj['poly'])))
                outline = ' '.join(bbox) + ' ' + obj['name']
                difficult = obj['difficult']
                if difficult:
                    outline = outline + ' ' + str(difficult)
                f_out.write(outline + '\n')
Example #9
Source File: loader.py From Att-ChemdNER with Apache License 2.0 | 6 votes |
def load_sentences(path, lower, zeros):
    #{{{
    """Load sentences from a CoNLL-style file.

    Each line must contain at least a word and its tag; sentences are
    separated by blank lines. Sentences whose first token contains a
    DOCSTART marker are dropped. When ``zeros`` is true, digits are
    normalised with zero_digits(). (``lower`` is accepted for interface
    compatibility; it is not used here, matching the original.)
    """
    sentences = []
    sentence = []

    def _flush():
        # Commit the current sentence unless it is empty or a DOCSTART header.
        if len(sentence) > 0 and 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)

    # 'with' closes the file; the original iterated an unclosed handle.
    with codecs.open(path, 'r', 'utf8') as f:
        for line in f:
            line = zero_digits(line.rstrip()) if zeros else line.rstrip()
            if not line:
                _flush()
                sentence = []
            else:
                word = line.split()
                assert len(word) >= 2
                sentence.append(word)
    _flush()  # the file may not end with a blank line
    return sentences
    #}}}
Example #10
Source File: load_base.py From lyrebird-api-coverage with MIT License | 6 votes |
def auto_load_base():
    """Load the API-coverage base description as parsed JSON.

    If the lyrebird config names a base file ('hunter.base'), its content
    is mirrored into DEFAULT_BASE and returned; otherwise DEFAULT_BASE is
    used directly (copying the bundled default first when it is missing).
    In both cases the file's SHA1 is recorded in app_context.base_sha1.
    """
    lyrebird_conf = lyrebird.context.application.conf
    base_path = lyrebird_conf.get('hunter.base')  # read the conf key once
    if base_path:
        # Mirror the configured base file into DEFAULT_BASE.
        with codecs.open(base_path, 'r', 'utf-8') as src:  # close the source handle
            base = src.read()
        with codecs.open(DEFAULT_BASE, 'w', 'utf-8') as f:
            f.write(base)
        app_context.base_sha1 = get_file_sha1(DEFAULT_BASE)
        return json.loads(base)
    # Fall back to the local default base file, creating it if absent.
    if not os.path.exists(DEFAULT_BASE):
        copy_file(DEFAULT_BASE)
    with codecs.open(DEFAULT_BASE, 'r', 'utf-8') as f:
        json_obj = json.load(f)
    app_context.base_sha1 = get_file_sha1(DEFAULT_BASE)
    return json_obj
Example #11
Source File: utils.py From DOTA_models with Apache License 2.0 | 6 votes |
def wordlabel2dark(self):
    """Convert word-label polygon files to darknet txt format.

    For every file under self.wordlabelpath, writes a file of the same
    name under self.darkpath with one line per kept object:
    "<class_index> <x_center> <y_center> <w> <h>", coordinates normalised
    by the 1024-pixel tile size. Difficult, out-of-border, and
    unknown-class objects are skipped.
    """
    filelist = GetFileFromThisRootDir(self.wordlabelpath)
    for fullname in filelist:
        objects = parse_bod_poly(fullname)
        name = mybasename(fullname)
        with open(os.path.join(self.darkpath, name + '.txt'), 'w') as f_out:
            for obj in objects:
                poly = obj['poly']
                # note: the box is x_center, y_center, w, h -- the whole
                # box can be out of border before filtering below.
                bbox = np.array(dots4ToRecC(poly)) / 1024
                if str(obj['difficult']) == '1':
                    continue
                if (sum(bbox <= 0) + sum(bbox >= 1)) >= 1:
                    # Drop boxes touching or crossing the tile border.
                    continue
                if obj['name'] not in wordname_15:
                    continue
                # Renamed from 'id' to avoid shadowing the builtin.
                cls_id = wordname_15.index(obj['name'])
                outline = str(cls_id) + ' ' + ' '.join(list(map(str, bbox)))
                f_out.write(outline + '\n')
Example #12
Source File: utils.py From DOTA_models with Apache License 2.0 | 6 votes |
def TransTo15Word_gt(self):
    """Translate ground-truth labels to the 15-class word-label format.

    Delegates to ParseTxtAndWrite, writing the converted files into the
    'wordlabel' directory using the datamap_15 class-name mapping.
    (A large commented-out TransTo15class draft was removed as dead code.)
    """
    dstpath = r'wordlabel'
    self.ParseTxtAndWrite(self.labelpath, dstpath, datamap_15)
Example #13
Source File: utils.py From DOTA_models with Apache License 2.0 | 6 votes |
def bodpolyToRec(self, label):
    """Convert polygon labels in <basepath>/<label> to axis-aligned
    rectangle labels written to <basepath>/ReclabelTxt (UTF-16 text).

    One output line per object: 8 rectangle coordinates, the class name,
    and the difficulty flag when it is truthy.
    """
    Recpath = os.path.join(self.basepath, r'ReclabelTxt')
    for basename in self.namelist:
        objects = parse_bod_poly(os.path.join(self.basepath, label, basename + '.txt'))
        # 'with' closes each output file; the original leaked one handle
        # per processed file.
        with codecs.open(os.path.join(Recpath, basename + '.txt'), 'w', 'utf_16') as f_out:
            for obj in objects:
                bbox = list(map(str, dots4ToRec8(obj['poly'])))
                outline = ' '.join(bbox) + ' ' + obj['name']
                difficult = obj['difficult']
                if difficult:
                    outline = outline + ' ' + str(difficult)
                f_out.write(outline + '\n')
Example #14
Source File: utils.py From DOTA_models with Apache License 2.0 | 6 votes |
def bod2darknet(subpath, label, extractclassname, outpath=r'/home/dj/data/bod-subset/labels'):
    """Convert BOD polygon label files to darknet txt format.

    subpath/label: directory holding the polygon label files.
    extractclassname: ordered class-name list; the list index is the
        emitted class id.
    outpath: destination directory -- generalized from a hard-coded
        constant; the default preserves the original behavior.
    """
    labelpath = os.path.join(subpath, label)
    filelist = GetFileFromThisRootDir(labelpath)
    for fullname in filelist:
        objects = parse_bod_poly(fullname)
        name = os.path.splitext(os.path.basename(fullname))[0]
        with open(os.path.join(outpath, name + '.txt'), 'w') as f_out:
            for obj in objects:
                poly = obj['poly']
                # Normalise x_center, y_center, w, h by the 1024-pixel tile.
                bbox = np.array(dots4ToRecC(poly)) / 1024
                if (sum(bbox <= 0) + sum(bbox >= 1)) >= 1:
                    continue  # box touches or crosses the tile border
                if obj['name'] not in extractclassname:
                    continue
                # Renamed from 'id' to avoid shadowing the builtin.
                cls_id = extractclassname.index(obj['name'])
                outline = str(cls_id) + ' ' + ' '.join(list(map(str, bbox)))
                f_out.write(outline + '\n')
Example #15
Source File: Self.py From CyberTK-Self with GNU General Public License v2.0 | 6 votes |
def sendVoice(self, to_, path):
    """Upload the audio file at ``path`` as a LINE voice message to ``to_``.

    Returns True on success; raises Exception when the upload endpoint
    does not answer 201.
    """
    M = Message(to=to_, text=None, contentType=3)
    M.contentPreview = None
    M_id = self._client.sendMessage(0, M).id
    # Open the file ONCE and derive its size by seeking: the original
    # opened two handles (one just to read the whole file for len())
    # and never closed either.
    with open(path, 'rb') as fh:
        fh.seek(0, 2)          # seek to end to get the size
        size = fh.tell()
        fh.seek(0)             # rewind so the upload sends the content
        params = {
            'name': 'voice_message',
            'oid': M_id,
            'size': size,
            'type': 'audio',
            'ver': '1.0',
        }
        data = {'params': json.dumps(params)}
        files = {'file': fh}
        r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
    if r.status_code != 201:
        raise Exception('Upload voice failure.')
    return True
Example #16
Source File: setup.py From L.E.S.M.A with Apache License 2.0 | 6 votes |
def find_version(*file_paths):
    """Extract __version__ from the file at ``here``/<file_paths>.

    The file is opened in Latin-1 so that we avoid encoding errors
    (codecs.open kept for Python 2 compatibility). Raises RuntimeError
    when the file cannot be read or no version string is found.
    """
    try:
        # 'with' guarantees the handle is closed even if read() fails.
        with codecs.open(os.path.join(here, *file_paths), 'r', 'latin1') as f:
            version_file = f.read()
    except (IOError, OSError):
        # Narrowed from a bare except: only I/O failures mean "not found";
        # anything else (e.g. a typo-level bug) should surface normally.
        raise RuntimeError("Unable to find version string.")
    # The version line must have the form
    # __version__ = 'ver'
    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                              version_file, re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")
Example #17
Source File: Self.py From CyberTK-Self with GNU General Public License v2.0 | 6 votes |
def sendAudio(self, to_, path):
    """Upload the audio file at ``path`` as a LINE audio message to ``to_``.

    Returns True on success (now consistent with sendVoice/sendImage);
    raises Exception when the upload endpoint does not answer 201.
    """
    M = Message(to=to_, text=None, contentType=3)
    M_id = self.Talk.client.sendMessage(0, M).id
    # Open the file ONCE and seek for its size: the original opened two
    # handles (one only to len() the whole content) and leaked both.
    with open(path, 'rb') as fh:
        fh.seek(0, 2)
        size = fh.tell()
        fh.seek(0)
        params = {
            'name': 'media',
            'oid': M_id,
            'size': size,
            'type': 'audio',
            'ver': '1.0',
        }
        data = {'params': json.dumps(params)}
        files = {'file': fh}
        r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
    print(r)  # was py2 'print r'; parentheses work on both 2 and 3
    if r.status_code != 201:
        raise Exception('Upload audio failure.')
    return True
Example #18
Source File: formatters.py From ciocheck with MIT License | 6 votes |
def _add_missing_init_py(self, paths):
    """Add missing __init__.py files in the module subdirectories.

    Returns one result dict per processed folder recording the init-file
    path, whether it had to be created, an (empty) diff, and no error.
    """
    results = []
    folders = [os.path.dirname(p) for p in paths]
    # Avoid adding an init on repo level if setup.py or other script on the
    # top level has changed
    if self.cmd_root in folders:
        folders.remove(self.cmd_root)
        # NOTE(review): list.remove() drops only the FIRST occurrence of
        # cmd_root; if several top-level files changed, the remaining
        # occurrences would still get a repo-level __init__.py -- confirm
        # whether 'folders' should be de-duplicated/filtered instead.
    for folder in folders:
        init_py = os.path.join(folder, "__init__.py")
        exists = os.path.exists(init_py)
        if not exists:
            # Create an empty package marker file.
            with codecs.open(init_py, 'w', 'utf-8') as handle:
                handle.flush()
        result = {
            'path': init_py,
            'created': not exists,  # False when the file was already there
            'diff': diff('', ''),
            'error': None,
        }
        results.append(result)
    return results
Example #19
Source File: Self.py From CyberTK-Self with GNU General Public License v2.0 | 6 votes |
def sendImage(self, to_, path):
    """Upload the image file at ``path`` as a LINE image message to ``to_``.

    Returns True on success; raises Exception when the upload endpoint
    does not answer 201.
    """
    M = Message(to=to_, contentType=1)
    M.contentMetadata = None
    M.contentPreview = None
    M_id = self.Talk.client.sendMessage(0, M).id
    # Open the file ONCE and seek for its size: the original opened two
    # handles (one only to len() the whole content) and leaked both.
    with open(path, 'rb') as fh:
        fh.seek(0, 2)
        size = fh.tell()
        fh.seek(0)
        params = {
            'name': 'media',
            'oid': M_id,
            'size': size,
            'type': 'image',
            'ver': '1.0',
        }
        data = {'params': json.dumps(params)}
        files = {'file': fh}
        r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
    if r.status_code != 201:
        raise Exception('Upload image failure.')
    return True
Example #20
Source File: Segmentation.py From text-rank with MIT License | 6 votes |
def __init__(self, stop_words_file = None, allow_speech_tags = util.allow_speech_tags):
    """
    Keyword arguments:
    stop_words_file -- path to a UTF-8 stop-word file, one word per line;
                       when it is not a str the default list is used
    allow_speech_tags -- part-of-speech tag whitelist used for filtering
    """
    allow_speech_tags = [util.as_text(item) for item in allow_speech_tags]
    self.default_speech_tag_filter = allow_speech_tags
    self.stop_words = set()
    self.stop_words_file = get_default_stop_words_file()
    if isinstance(stop_words_file, str):  # was: type(...) is str
        self.stop_words_file = stop_words_file
    # 'with' closes the file (the original leaked the handle); the
    # 'ignore' error handler skips undecodable bytes.
    with codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore') as f:
        for word in f:
            self.stop_words.add(word.strip())
Example #21
Source File: data_helpers.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and
    generates labels. Returns split sentences and labels as [x_text, y].
    """
    # download dataset
    get_chinese_text()
    # Load data from files; 'with' ensures both handles are closed
    # (the original left them open).
    with codecs.open("./data/pos.txt", "r", "utf-8") as f:
        positive_examples = [s.strip() for s in f.readlines()]
    positive_examples = [pe for pe in positive_examples if len(pe) < 100]
    with codecs.open("./data/neg.txt", "r", "utf-8") as f:
        negative_examples = [s.strip() for s in f.readlines()]
    negative_examples = [ne for ne in negative_examples if len(ne) < 100]
    # Split each sentence into characters (suitable for Chinese text).
    x_text = positive_examples + negative_examples
    x_text = [list(s) for s in x_text]
    # Generate one-hot labels: [0, 1] = positive, [1, 0] = negative.
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
Example #22
Source File: preprocess.py From open-sesame with Apache License 2.0 | 5 votes |
def filter_embeddings(embedding_files):
    """
    Filters each embeddings file to retain only the vocabulary present in
    the train, dev and test files, writing "<name><VERSION>.framevocab.txt".
    """
    sys.stderr.write("\nReading FrameNet {} vocabulary...\n".format(VERSION))
    vocab = set()
    corpora = [DEV_CONLL, TRAIN_FTE, TRAIN_EXEMPLAR, TEST_CONLL]
    for corpus in corpora:
        # Column 1 of every non-blank CoNLL line is the token FORM.
        with codecs.open(corpus, "r", "utf-8") as cf:
            vocab.update(line.split("\t")[1].lower() for line in cf if line != "\n")
    sys.stderr.write("\nTotal (train + dev + test) vocabulary size = {}\nFiltering out the word vectors ...".format(len(vocab)))
    for emb_file in embedding_files:
        # BUG FIX: the output name was derived from DATA_DIR (which contains
        # no ".txt"), so every input file overwrote the same output; derive
        # it from the embedding file's own path instead.
        new_embeddings_file = (DATA_DIR + emb_file).split(".txt")[0] + VERSION + ".framevocab.txt"
        num_embeddings = 0
        # 'with' closes both handles even if a line fails to parse.
        with open(DATA_DIR + emb_file, 'r') as embeddings_file, \
                open(new_embeddings_file, 'w') as filtered_embeddings:
            for l in embeddings_file:
                wd = l.strip().split(' ')[0].lower()
                if wd in vocab:
                    filtered_embeddings.write(l)
                    num_embeddings += 1
        sys.stderr.write("\nTotal embeddings in {} = {}\n".format(new_embeddings_file, num_embeddings))
Example #23
Source File: utils.py From DOTA_models with Apache License 2.0 | 5 votes |
def GetListFromfile(fullname):
    """Read ``fullname`` and return the set of its lines, whitespace-stripped."""
    with open(fullname, 'r') as handle:
        return {line.strip() for line in handle}
Example #24
Source File: setup.py From tvdbsimple with GNU General Public License v3.0 | 5 votes |
def read(fname):
    """Return the UTF-8 text of ``fname``, resolved relative to this
    file's directory, or '' when the file does not exist.

    Used to pull in the long description from the README file.
    """
    full_path = path.join(path.abspath(path.dirname(__file__)), fname)
    if not path.isfile(full_path):
        return ''
    with open(full_path, encoding='utf-8') as handle:
        return handle.read()
Example #25
Source File: preprocess_syntax.py From open-sesame with Apache License 2.0 | 5 votes |
def join_fnconll_parseyconll(conllfile, synfile, outfile):
    """Merge a FrameNet CoNLL 2009 file with a parsey/syntaxnet CoNLL file
    line-by-line, writing combined 15-column rows to ``outfile``.

    Assumes both inputs are aligned line-for-line (including blank
    sentence-separator lines). Python 2 only: relies on itertools.izip.
    """
    with codecs.open(outfile, "w", "utf-8") as outf:
        with codecs.open(conllfile, "r", "utf-8") as cf:
            with codecs.open(synfile, "r", "utf-8") as sf:
                for l,sl in izip(cf,sf):
                    cfields = l.strip().split("\t")
                    if len(cfields) == 1:
                        # Blank line = sentence boundary in both files.
                        outf.write("\n")
                        continue
                    if len(cfields) != 15:
                        raise Exception("incorrect CoNLL 2009 format", l, cfields)
                    sfields = sl.strip().split("\t")
                    if len(sfields) != 10:
                        raise Exception("incorrect parsey CoNLL format")
                    newfields = cfields[:4] # ID FORM LEMMA PLEMMA = 0,1,2,3
                    newfields += sfields[3:6:2] # syntaxnetPOS fnPOS = 4,5 ~ replacing POS PPOS
                    newfields += cfields[6:9] # sent_num PFEAT HEAD = 6,7,8 ~ replacing FEAT PFEAT HEAD
                    newfields += sfields[6:7] # syntaxnetHEAD = 9 ~ replacing PHEAD
                    newfields += cfields[10:11] # DEPREL = 10
                    newfields += sfields[7:8] # syntaxnetDEPREL = 11 ~ replacing PDEPREL
                    newfields += cfields[12:] # FILLPRED PRED APREDS = 12,13,14
                    if len(newfields) != len(cfields):
                        raise Exception("didn't join properly", len(newfields), len(cfields), newfields)
                    outf.write("\t".join(newfields) + "\n")
                # close() calls are redundant inside 'with' but harmless.
                sf.close()
                cf.close()
            outf.close()
Example #26
Source File: data.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 5 votes |
def from_file(filename):
    """Build a Vocabulary from a file of "<word> <count>" lines."""
    vocab = Vocabulary()
    with codecs.open(filename, "r", "utf-8") as handle:
        for entry in handle:
            token, freq = entry.strip().split()
            vocab.add(token, int(freq))
    vocab.finalize()
    return vocab
Example #27
Source File: data.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 5 votes |
def _parse_file(self, file_name):
    """Yield parsed sentences from ``file_name``, one input line each."""
    logging.debug("Processing file: %s" % file_name)
    with codecs.open(file_name, "r", "utf-8") as f:
        lines = [line.strip() for line in f]
    # NOTE(review): this shuffles when self._shuffle is FALSE, which looks
    # inverted -- confirm the intended meaning of _shuffle; if it means
    # "shuffle the data", the condition should be `if self._shuffle:`.
    if not self._shuffle:
        random.shuffle(lines)
    logging.debug("Finished processing!")
    for line in lines:
        # Lazily hand each line to the project-specific sentence parser.
        yield self._parse_sentence(line)
Example #28
Source File: load_base.py From lyrebird-api-coverage with MIT License | 5 votes |
def get_file_sha1(path):
    """Return the hex SHA-1 digest of the file at ``path``."""
    with open(path, 'rb') as handle:
        return hashlib.sha1(handle.read()).hexdigest()
Example #29
Source File: fe_to_conll.py From open-sesame with Apache License 2.0 | 5 votes |
def join_google_fe_test_conll(conllfile, frames, tfdict1, sent1, outfile):
    """Rewrite a CoNLL 2009 file, overlaying externally predicted frames
    onto columns 12-14 while keeping our lexical units (column 12).

    frames: per-sentence, per-token frame predictions (indexed as
    frames[sent_num][position][0]); tfdict1/sent1 provide the same for
    one additional sentence. Rows with no prediction get EMPTY labels.
    """
    with codecs.open(outfile, "w", "utf-8") as outf:
        with codecs.open(conllfile, "r", "utf-8") as cf:
            for l in cf:
                cfields = l.strip().split("\t")
                if len(cfields) == 1:
                    # Blank line = sentence boundary.
                    outf.write("\n")
                    continue
                if len(cfields) != 15:
                    raise Exception("incorrect CoNLL 2009 format", l, cfields)
                newfields = cfields[:12]
                # FEAT column (6) holds the sentence number; ID (0) is 1-based.
                sent_num = int(cfields[6])
                position = int(cfields[0])-1
                if sent_num in frames and position in frames[sent_num] and cfields[12] != gc.EMPTY_LABEL:
                    newfields.append(cfields[12]) # keep our LUs but use their frames.
                    newfields.append(frames[sent_num][position][0])
                    newfields.append(gc.EMPTY_FE)
                elif sent_num == sent1 and position in tfdict1:
                    newfields.append(cfields[12]) # keep our LUs but use their frames.
                    newfields.append(tfdict1[position][0])
                    newfields.append(gc.EMPTY_FE)
                else:
                    newfields += [gc.EMPTY_LABEL, gc.EMPTY_LABEL, gc.EMPTY_FE] # FILLPRED PRED APREDS = 12,13,14
                if len(newfields) != len(cfields):
                    raise Exception("didn't join properly", len(newfields), len(cfields), newfields)
                outf.write("\t".join(newfields) + "\n")
            # close() calls are redundant inside 'with' but harmless.
            cf.close()
        outf.close()
Example #30
Source File: data_helpers.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 5 votes |
def load_pretrained_word2vec(infile):
    """Load a word2vec text file into a {word: [float, ...]} dict.

    infile may be a path or an already-open file object. The first line
    (the "<vocab_size> <dim>" header) is skipped.
    """
    opened_here = isinstance(infile, str)
    if opened_here:
        infile = open(infile)
    try:
        word2vec = {}
        for idx, line in enumerate(infile):
            if idx == 0:
                # Header line: vocabulary size and vector dimensionality.
                continue
            tks = line.strip().split()
            # list(...) so the vector is reusable -- on Python 3 a bare
            # map object is a one-shot iterator and would be exhausted
            # after its first use.
            word2vec[tks[0]] = list(map(float, tks[1:]))
        return word2vec
    finally:
        # Close the handle only if we opened it (the original leaked it).
        if opened_here:
            infile.close()