Python codecs.open() Examples
The following are 30 code examples showing how to use codecs.open(). They are extracted from open source projects; the project, author, file, and license are listed above each example.
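As a quick orientation before the project examples, here is a minimal sketch of codecs.open() on its own: the call takes a file name, a mode, an encoding, and an optional errors policy. The file name and strings below are placeholders rather than anything taken from the examples. On Python 3 the built-in open(..., encoding=...) is generally preferred, and codecs.open() survives mostly for Python 2 compatibility (see Example 6).

import codecs

# Write, then read back, a UTF-8 text file.
# 'example.txt' is a placeholder file name.
with codecs.open('example.txt', 'w', encoding='utf-8') as f:
    f.write(u'first line\n')
    f.write(u'second line\n')

with codecs.open('example.txt', 'r', encoding='utf-8') as f:
    for line in f:
        print(line.strip())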
Example 1
Project: Financial-NLP Author: Coldog2333 File: NLP.py License: Apache License 2.0
def txt2sentence(self, filename):
    """
    read a <cut_file> and return sentences: a list of lists,
    where each inner list is a list of words.
    """
    sentences = []
    try:
        fp = open(filename, 'r', encoding='utf-8')
        lines = fp.readlines()
    except:
        fp = open(filename, 'r', encoding='gbk')
        lines = fp.readlines()
    for line in lines:
        line = line.strip()
        if len(line) <= 1:
            continue
        line = line.replace('\n', '').replace('\r', '').split(' ')
        sentences.append(line)
    return sentences
Example 2
Project: Financial-NLP Author: Coldog2333 File: NLP.py License: Apache License 2.0
def loadWordNet(self):
    """
    load zh_wordnet into the object.
    Consolidate the dataset in the cow-not-full file into a set.
    """
    f = codecs.open(self.wordnet_txt, "rb", "utf-8")
    self.known = dict()
    #self.known = set()
    for l in f:
        if l.startswith('\ufeff#') or not l.strip():
            continue
        row = l.strip().split("\t")
        (synset, lemma) = row
        #if len(row) == 2:
        #    (synset, lemma) = row
        #elif len(row) == 3:
        #    (synset, lemma, status) = row  # there are no three-element rows at all
        #else:
        #    print("illformed line: ", l.strip())
        #if not (synset.strip(), lemma.strip()) in self.known:
        #    self.known.add((synset.strip(), lemma.strip()))
        if not lemma.strip() in self.known.keys():
            self.known[lemma.strip()] = []
        self.known[lemma.strip()].append(synset)
Example 3
Project: Att-ChemdNER Author: lingluodlut File: utils.py License: Apache License 2.0
def get_perf(filename):
    ''' run conlleval.pl perl script to obtain precision/recall and F1 score '''
    _conlleval = PREFIX + 'conlleval'
    if not isfile(_conlleval):
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl')
        os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl')
        chmod('conlleval.pl', stat.S_IRWXU)  # give the execute permissions

    out = []
    proc = subprocess.Popen(["perl", _conlleval],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(open(filename).read())
    for line in stdout.split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break
    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']
    precision = float(out[3][:-2])
    recall = float(out[5][:-2])
    f1score = float(out[7])
    return {'p': precision, 'r': recall, 'f1': f1score}
Example 4
Project: Att-ChemdNER Author: lingluodlut File: model.py License: Apache License 2.0
def save_mappings(self, id_to_word, id_to_char, id_to_tag):
    #{{{
    """
    We need to save the mappings if we want to use the model later.
    """
    self.id_to_word = id_to_word
    self.id_to_char = id_to_char
    self.id_to_tag = id_to_tag
    with open(self.mappings_path, 'wb') as f:
        mappings = {
            'id_to_word': self.id_to_word,
            'id_to_char': self.id_to_char,
            'id_to_tag': self.id_to_tag,
        }
        cPickle.dump(mappings, f)
    #}}}
Example 5
Project: Att-ChemdNER Author: lingluodlut File: loader.py License: Apache License 2.0
def load_sentences(path, lower, zeros):
    #{{{
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
    #}}}
Example 6
Project: L.E.S.M.A Author: NatanaelAntonioli File: setup.py License: Apache License 2.0
def find_version(*file_paths):
    # Open in Latin-1 so that we avoid encoding errors.
    # Use codecs.open for Python 2 compatibility
    try:
        f = codecs.open(os.path.join(here, *file_paths), 'r', 'latin1')
        version_file = f.read()
        f.close()
    except:
        raise RuntimeError("Unable to find version string.")

    # The version line must have the form
    # __version__ = 'ver'
    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                              version_file, re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")

# Get the long description from the relevant file
Example 7
Project: ciocheck Author: ContinuumIO File: formatters.py License: MIT License
def _add_missing_init_py(self, paths):
    """Add missing __init__.py files in the module subdirectories."""
    results = []
    folders = [os.path.dirname(p) for p in paths]

    # Avoid adding an init on repo level if setup.py or other script on the
    # top level has changed
    if self.cmd_root in folders:
        folders.remove(self.cmd_root)

    for folder in folders:
        init_py = os.path.join(folder, "__init__.py")
        exists = os.path.exists(init_py)
        if not exists:
            with codecs.open(init_py, 'w', 'utf-8') as handle:
                handle.flush()
            result = {
                'path': init_py,
                'created': not exists,
                'diff': diff('', ''),
                'error': None,
            }
            results.append(result)
    return results
Example 8
Project: text-rank Author: ouprince File: Segmentation.py License: MIT License
def __init__(self, stop_words_file=None, allow_speech_tags=util.allow_speech_tags):
    """
    Keyword arguments:
    stop_words_file   -- path to the stop-words file, UTF-8 encoded, one stop word
                         per line. If it is not of type str, the default stop
                         words are used.
    allow_speech_tags -- list of POS tags used for filtering
    """
    allow_speech_tags = [util.as_text(item) for item in allow_speech_tags]
    self.default_speech_tag_filter = allow_speech_tags
    self.stop_words = set()
    self.stop_words_file = get_default_stop_words_file()
    if type(stop_words_file) is str:
        self.stop_words_file = stop_words_file
    for word in codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore'):
        self.stop_words.add(word.strip())
Example 9
Project: open-sesame Author: swabhs File: preprocess.py License: Apache License 2.0
def write_to_conll(outf, fsp, firstex, sentid):
    mode = "a"
    if firstex:
        mode = "w"

    with codecs.open(outf, mode, "utf-8") as outf:
        for i in xrange(fsp.sent.size()):
            token, postag, nltkpostag, nltklemma, lu, frm, role = fsp.info_at_idx(i)

            outf.write(str(i + 1) + "\t")                  # ID = 0
            outf.write(token.encode('utf-8') + "\t")       # FORM = 1
            outf.write("_\t" + nltklemma + "\t")           # LEMMA PLEMMA = 2,3
            outf.write(postag + "\t" + nltkpostag + "\t")  # POS PPOS = 4,5
            outf.write(str(sentid - 1) + "\t_\t")          # FEAT PFEAT = 6,7 ~ replacing FEAT with sentence number
            outf.write("_\t_\t")                           # HEAD PHEAD = 8,9
            outf.write("_\t_\t")                           # DEPREL PDEPREL = 10,11
            outf.write(lu + "\t" + frm + "\t")             # FILLPRED PRED = 12,13
            outf.write(role + "\n")                        # APREDS = 14
        outf.write("\n")  # end of sentence
    outf.close()
Example 10
Project: dynamic-training-with-apache-mxnet-on-aws Author: awslabs File: data_helpers.py License: Apache License 2.0
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # download dataset
    get_chinese_text()

    # Load data from files
    positive_examples = list(codecs.open("./data/pos.txt", "r", "utf-8").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    positive_examples = [pe for pe in positive_examples if len(pe) < 100]
    negative_examples = list(codecs.open("./data/neg.txt", "r", "utf-8").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    negative_examples = [ne for ne in negative_examples if len(ne) < 100]
    # Split by words
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    x_text = [list(s) for s in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
Example 11
Project: CyberTK-Self Author: CyberTKR File: Self.py License: GNU General Public License v2.0
def sendImage(self, to_, path):
    M = Message(to=to_, contentType=1)
    M.contentMetadata = None
    M.contentPreview = None
    M_id = self.Talk.client.sendMessage(0, M).id
    files = {
        'file': open(path, 'rb'),
    }
    params = {
        'name': 'media',
        'oid': M_id,
        'size': len(open(path, 'rb').read()),
        'type': 'image',
        'ver': '1.0',
    }
    data = {
        'params': json.dumps(params)
    }
    r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
    if r.status_code != 201:
        raise Exception('Upload image failure.')
    return True
Example 12
Project: CyberTK-Self Author: CyberTKR File: Self.py License: GNU General Public License v2.0
def sendAudio(self, to_, path):
    M = Message(to=to_, text=None, contentType=3)
    M_id = self.Talk.client.sendMessage(0, M).id
    files = {
        'file': open(path, 'rb'),
    }
    params = {
        'name': 'media',
        'oid': M_id,
        'size': len(open(path, 'rb').read()),
        'type': 'audio',
        'ver': '1.0',
    }
    data = {
        'params': json.dumps(params)
    }
    r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
    print r
    if r.status_code != 201:
        raise Exception('Upload audio failure.')
Example 13
Project: CyberTK-Self Author: CyberTKR File: Self.py License: GNU General Public License v2.0
def sendVoice(self, to_, path):
    M = Message(to=to_, text=None, contentType=3)
    M.contentPreview = None
    M_id = self._client.sendMessage(0, M).id
    files = {
        'file': open(path, 'rb'),
    }
    params = {
        'name': 'voice_message',
        'oid': M_id,
        'size': len(open(path, 'rb').read()),
        'type': 'audio',
        'ver': '1.0',
    }
    data = {
        'params': json.dumps(params)
    }
    r = self.post_content('https://os.line.naver.jp/talk/m/upload.nhn', data=data, files=files)
    if r.status_code != 201:
        raise Exception('Upload voice failure.')
    return True
Example 14
Project: DOTA_models Author: ringringyi File: utils.py License: Apache License 2.0
def bod2darknet(subpath, label, extractclassname):
    labelpath = os.path.join(subpath, label)
    filelist = GetFileFromThisRootDir(labelpath)
    outpath = r'/home/dj/data/bod-subset/labels'
    for fullname in filelist:
        objects = parse_bod_poly(fullname)
        name = os.path.splitext(os.path.basename(fullname))[0]
        with open(os.path.join(outpath, name + '.txt'), 'w') as f_out:
            for obj in objects:
                poly = obj['poly']
                bbox = np.array(dots4ToRecC(poly)) / 1024
                if (sum(bbox <= 0) + sum(bbox >= 1)) >= 1:
                    continue
                if (obj['name'] in extractclassname):
                    id = extractclassname.index(obj['name'])
                else:
                    continue
                outline = str(id) + ' ' + ' '.join(list(map(str, bbox)))
                f_out.write(outline + '\n')
Example 15
Project: DOTA_models Author: ringringyi File: utils.py License: Apache License 2.0
def bodpolyToRec(self, label):
    Recpath = os.path.join(self.basepath, r'ReclabelTxt')
    for basename in self.namelist:
        # objects = parse_bod_poly(os.path.join(self.labelpath, basename + '.txt'))
        objects = parse_bod_poly(os.path.join(self.basepath, label, basename + '.txt'))
        f_out = codecs.open(os.path.join(Recpath, basename + '.txt'), 'w', 'utf_16')
        for obj in objects:
            bbox = dots4ToRec8(obj['poly'])
            name = obj['name']
            difficult = obj['difficult']
            bbox = list(map(str, bbox))
            outline = ' '.join(bbox)
            outline = outline + ' ' + name
            if difficult:
                outline = outline + ' ' + str(difficult)
            f_out.write(outline + '\n')
Example 16
Project: DOTA_models Author: ringringyi File: utils.py License: Apache License 2.0
def TransTo15Word_gt(self):
    dstpath = r'wordlabel'
    self.ParseTxtAndWrite(self.labelpath, dstpath, datamap_15)

# def TransTo15class(self, path):
#     filelist = GetFileFromThisRootDir(self.labelpath)
#     for fullname in filelist:
#         objects = parse_bod_poly2(fullname)
#         name = mybasename(fullname)
#         outname = os.path.join(self.basepath, path, name + '.txt')
#         f_out = codecs.open(outname, 'w', 'utf_16')
#         for obj in objects:
#             if obj['name'] in classname_15:
#                 if path == 'wordlabel':
#                     outline = ' '.join(map(str, obj['poly'])) + ' ' + datamap_15[obj['name']] + ' ' + str(obj['difficult'])
#                     print('outline:', outline)
#                     #f_out.write(outline + '\n')
#                 elif path == 'label15Txt':
#                     outline = ' '.join(map(str, obj['poly'])) + ' ' + obj['name'] + ' ' + str(obj['difficult'])
#                     print('outline:', outline)
#                     f_out.write(outline + '\n')
Example 17
Project: DOTA_models Author: ringringyi File: utils.py License: Apache License 2.0
def wordlabel2dark(self):
    filelist = GetFileFromThisRootDir(self.wordlabelpath)
    #print(filelist)
    for fullname in filelist:
        objects = parse_bod_poly(fullname)
        name = mybasename(fullname)
        with open(os.path.join(self.darkpath, name + '.txt'), 'w') as f_out:
            for obj in objects:
                poly = obj['poly']
                bbox = np.array(dots4ToRecC(poly)) / 1024
                ## note: the box is x_center, y_center, w, h, that means the whole box can be out of border
                if (str(obj['difficult']) == '1'):
                    continue
                if (sum(bbox <= 0) + sum(bbox >= 1)) >= 1:
                    continue
                if (obj['name'] in wordname_15):
                    id = wordname_15.index(obj['name'])
                else:
                    continue
                outline = str(id) + ' ' + ' '.join(list(map(str, bbox)))
                f_out.write(outline + '\n')
Example 18
Project: DOTA_models Author: ringringyi File: utils.py License: Apache License 2.0
def bodpolyToRec(srcpath, dstpath):
    #dstpath = os.path.join(r'E:\bod-dataset\patches\subcategorylabel\results\ReclabelTxt')
    filelist = GetFileFromThisRootDir(srcpath)
    namelist = [mybasename(x.strip()) for x in filelist]
    for basename in namelist:
        # objects = parse_bod_poly(os.path.join(self.labelpath, basename + '.txt'))
        objects = parse_bod_poly(os.path.join(srcpath, basename + '.txt'))
        f_out = codecs.open(os.path.join(dstpath, basename + '.txt'), 'w', 'utf_16')
        for obj in objects:
            bbox = dots4ToRec8(obj['poly'])
            name = obj['name']
            difficult = obj['difficult']
            bbox = list(map(str, bbox))
            outline = ' '.join(bbox)
            outline = outline + ' ' + name
            if difficult:
                outline = outline + ' ' + str(difficult)
            f_out.write(outline + '\n')
Example 19
Project: trip-advisor-crawler Author: aesuli File: trip-advisor-crawler.py License: GNU General Public License v3.0
def getreview(domain, cityid, activity, reviewid, timeout, maxretries, basepath, force, pause):
    baseurl = 'http://www.tripadvisor.' + domain + '/ShowUserReviews-g'
    reviewurl = '%s%s-d%s-r%s' % (baseurl, cityid, activity, reviewid)

    path = os.sep.join((basepath, domain, str(cityid), str(activity)))
    filename = os.sep.join((path, str(reviewid) + '.html'))

    if force or not os.path.exists(filename):
        htmlpage = download_page(reviewurl, maxretries, timeout, pause)
        if htmlpage is None:
            print('Error downloading the review URL: ' + reviewurl)
        else:
            if not os.path.exists(path):
                os.makedirs(path)
            with codecs.open(filename, mode='w', encoding='utf8') as file:
                file.write(htmlpage.decode('utf-8'))
Example 20
Project: keras-gpt-2 Author: CyberZHG File: bpe.py License: MIT License
def get_bpe_from_files(encoder_path, vocab_path):
    """Get initialized BPE.

    :param encoder_path: Path to 'encoder.json'.
    :param vocab_path: Path to 'vocab.bpe'.
    :return: The object used to encode and decode strings.
    """
    with codecs.open(encoder_path, 'r', 'utf8') as reader:
        token_dict = json.load(reader)
    bpe_rank = {}
    with codecs.open(vocab_path, 'r', 'utf8') as reader:
        reader.readline()
        for rank, line in enumerate(reader):
            line = line.strip()
            if line:
                bpe_rank[tuple(line.split())] = rank
    return BytePairEncoding(token_dict, bpe_rank)
Example 21
Project: lyrebird-api-coverage Author: Meituan-Dianping File: load_base.py License: MIT License
def auto_load_base():
    lyrebird_conf = lyrebird.context.application.conf
    # Read the specified base file and write it to base.json
    if lyrebird_conf.get('hunter.base'):
        base_path = lyrebird_conf.get('hunter.base')
        base = codecs.open(base_path, 'r', 'utf-8').read()
        f = codecs.open(DEFAULT_BASE, 'w', 'utf-8')
        f.write(base)
        f.close()
        app_context.base_sha1 = get_file_sha1(DEFAULT_BASE)
        return json.loads(base)
    # Get the base from the local default base file
    elif not os.path.exists(DEFAULT_BASE):
        copy_file(DEFAULT_BASE)
    with codecs.open(DEFAULT_BASE, 'r', 'utf-8') as f:
        json_obj = json.load(f)
    app_context.base_sha1 = get_file_sha1(DEFAULT_BASE)
    return json_obj
Example 22
Project: mutatest Author: EvanKepner File: conf.py License: MIT License
def read(*parts):
    """
    Build an absolute path from *parts* and return the contents of the
    resulting file. Assume UTF-8 encoding.
    """
    with codecs.open(os.path.join(HERE, *parts), "rb", "utf-8") as f:
        return f.read()
Example 23
Project: mutatest Author: EvanKepner File: setup.py License: MIT License
def read(*parts):
    """
    Build an absolute path from *parts* and return the contents of the
    resulting file. Assume UTF-8 encoding.
    """
    with codecs.open(os.path.join(HERE, *parts), "rb", "utf-8") as f:
        return f.read()
Example 24
Project: Financial-NLP Author: Coldog2333 File: NLP.py License: Apache License 2.0
def loadstopwords(self):
    """
    load stopwords into the object.
    """
    self.stop_words = list()
    stop_f = open(self.stopwords_txt, 'r', encoding='utf-8')
    for line in stop_f.readlines():
        line = line.strip()
        if not len(line):
            continue
        self.stop_words.append(line)
    stop_f.close()
Example 25
Project: Financial-NLP Author: Coldog2333 File: NLP.py License: Apache License 2.0
def txt2wordbag(self, origin_file, cutflag=False, remove_stopwords=True):  # testing
    """
    please remember to set a corresponding processing file.
    """
    if origin_file.split('.')[0][-3:] != 'cut':
        cut_file = self.cut(origin_file, remove_stopwords=True, swith_to_newtxt=True)
    else:
        cut_file = origin_file
    try:
        fp = open(cut_file, 'r', encoding='utf-8')
        rawtxt = fp.read()
    except:
        fp = open(cut_file, 'r', encoding='gbk')
        rawtxt = fp.read()
    words_list = rawtxt.split(' ')
    new_words_list = []
    for word in words_list:
        if word == '' or (ord(word[0]) < 1024):
            continue
        else:
            new_words_list.append(word)
    if new_words_list == '\u3000':
        return new_words_list[1:]
    else:
        return new_words_list
Example 26
Project: Att-ChemdNER Author: lingluodlut File: tagger.py License: Apache License 2.0
def load_sentences(path):
    sentences = []
    for line in codecs.open(path, 'r', 'utf8'):
        sentence = []
        line = line.rstrip()
        if line:
            word = line.split()
            for elem in word:
                sentence.append([elem])
            sentences.append(sentence)
    return sentences
Example 27
Project: Att-ChemdNER Author: lingluodlut File: utils.py License: Apache License 2.0
def findNotSame(fNameX, fNameY):
    #{{{
    """ verify whether two files are the same """
    space = 'space'

    def loadFile(fName):
        word = []
        import codecs
        for line in codecs.open(fName, 'r', 'utf8'):
            line = line.rstrip()
            if len(line) > 0:
                word.append(line[0])
            else:
                word.append(space)
        return word

    word1 = loadFile(fNameX)
    word2 = loadFile(fNameY)
    i = 0
    j = 0
    while i < len(word1) and j < len(word2):
        if word1[i] == word2[j]:
            i += 1
            j += 1
            continue
        elif word1[i] == space:
            i += 1
        elif word2[j] == space:
            j += 1
        else:
            print "not same,X:", word1[i], ",line:", i, ',Y:', word2[j], ',line:', j
            break
    #}}}
Example 28
Project: Att-ChemdNER Author: lingluodlut File: utils.py License: Apache License 2.0
def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
             id_to_tag, dictionary_tags, filename, useAttend=True):
    #{{{
    """
    Evaluate current model using CoNLL script.
    """
    n_tags = len(id_to_tag)
    predictions = []
    count = np.zeros((n_tags, n_tags), dtype=np.int32)

    for raw_sentence, data in zip(raw_sentences, parsed_sentences):
        input = create_input(data, parameters, False, useAttend=useAttend)
        if parameters['crf']:
            y_preds = np.array(f_eval(*input))
        else:
            y_preds = f_eval(*input).argmax(axis=1)
        y_reals = np.array(data['tags']).astype(np.int32)
        assert len(y_preds) == len(y_reals)
        p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
        r_tags = [id_to_tag[y_real] for y_real in y_reals]
        if parameters['tag_scheme'] == 'iobes':
            p_tags = iobes_iob(p_tags)
            r_tags = iobes_iob(r_tags)
        for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)):
            new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]])
            predictions.append(new_line)
            count[y_real, y_pred] += 1
        predictions.append("")

    # write to file
    with codecs.open(filename, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
    return get_perf(filename)
    #}}}
Example 29
Project: Att-ChemdNER Author: lingluodlut File: Atten_tagger.py License: Apache License 2.0
def load_sentences(path):
    sentences = []
    for line in codecs.open(path, 'r', 'utf8'):
        sentence = []
        line = line.rstrip()
        if line:
            word = line.split()
            for elem in word:
                sentence.append([elem])
            sentences.append(sentence)
    return sentences
Example 30
Project: Att-ChemdNER Author: lingluodlut File: model.py License: Apache License 2.0
def reload_mappings(self):
    #{{{
    """
    Load mappings from disk.
    """
    with open(self.mappings_path, 'rb') as f:
        mappings = cPickle.load(f)
    self.id_to_word = mappings['id_to_word']
    self.id_to_char = mappings['id_to_char']
    self.id_to_tag = mappings['id_to_tag']
    #}}}