Python luigi.run() Examples
The following are 30 code examples of luigi.run().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the luigi module, or try the search function.
Example #1
Source File: repdocs.py From dblp with MIT License | 6 votes |
def run(self):
    """Build one representative document (repdoc) per author.

    Reads the per-paper repdoc CSV and the authorship CSV from
    ``self.input()``, groups each author's paper docs together, and writes
    (author_id, doc) rows where ``doc`` is the '|'-joined concatenation of
    the docs of all papers that author wrote.

    BUGFIX: ``dict.iteritems()`` is Python-2-only; ``items()`` works on
    both Python 2 and 3.
    """
    paper_repdocs_file, author_file = self.input()
    with paper_repdocs_file.open() as pfile:
        paper_df = pd.read_csv(pfile, index_col=(0,))
        # Missing docs become empty strings so joining never sees NaN.
        paper_df.fillna('', inplace=True)
    # read out authorship records
    with author_file.open() as afile:
        author_df = pd.read_csv(afile, header=0, index_col=(0,))
    # initialize repdoc dictionary from complete list of person ids, so
    # authors with no matching paper still get an (empty) repdoc
    author_ids = author_df.index.unique()
    repdocs = {i: [] for i in author_ids}
    # build up repdocs for each author
    for person_id, paper_id in author_df.itertuples():
        doc = paper_df.loc[paper_id]['doc']
        repdocs[person_id].append(doc)
    # save repdocs
    rows = ((person_id, '|'.join(docs))
            for person_id, docs in repdocs.items())
    util.write_csv_to_fwrapper(self.output(), ('author_id', 'doc'), rows)
Example #2
Source File: filtering.py From dblp with MIT License | 6 votes |
def run(self):
    """Keep papers published in [self.start, self.end] and only those
    references whose citing AND cited paper both survive the filter."""
    papers_in, refs_in = self.input()
    papers_out, refs_out = self.output()

    with papers_in.open() as fh:
        papers = pd.read_csv(fh)

    # Restrict papers to the configured year range.
    papers['year'] = papers['year'].astype(int)
    in_range = (papers['year'] >= self.start) & (papers['year'] <= self.end)
    kept_papers = papers[in_range]

    # Save filtered paper records.
    with papers_out.open('w') as fh:
        kept_papers.to_csv(fh, index=False)

    kept_ids = kept_papers['id'].unique()

    # Filter and save references based on surviving paper ids.
    with refs_in.open() as fh:
        refs = pd.read_csv(fh)
    kept_refs = refs[refs['paper_id'].isin(kept_ids) &
                     refs['ref_id'].isin(kept_ids)]
    with refs_out.open('w') as fh:
        kept_refs.to_csv(fh, index=False)
Example #3
Source File: wordcount.py From HadoopWithPython with MIT License | 6 votes |
def run(self):
    """The task's logic: count whitespace-separated words in the input and
    write one "word<TAB>count" line per word to the output.

    BUGFIX: the original never closed the input file and closed the output
    only on the happy path; ``with`` blocks guarantee both are closed even
    when an exception is raised.
    """
    count = {}
    with self.input().open('r') as ifp:
        for line in ifp:
            for word in line.strip().split():
                count[word] = count.get(word, 0) + 1
    with self.output().open('w') as ofp:
        for k, v in count.items():
            ofp.write('{}\t{}\n'.format(k, v))
Example #4
Source File: convert.py From dblp with MIT License | 6 votes |
def run(self):
    """Dump the Matrix Market corpus as "docnum<TAB>termid" lines — one
    line per (document, term) pair, grouped per document."""
    corpus = gensim.corpora.MmCorpus(self.input().path)
    with self.output().open('w') as out:
        for docnum, offset in enumerate(corpus.index):
            # Each doc is a list of (term_id, frequency) pairs; only the
            # term ids are emitted.
            doc = corpus.docbyoffset(offset)
            lines = ['%s\t%s' % (docnum, term_id) for term_id, _ in doc]
            out.write('%s\n' % '\n'.join(lines))
Example #5
Source File: build_graphs.py From dblp with MIT License | 6 votes |
def run(self):
    """Extract the largest connected component (LCC) of the author graph
    and save it as graphml, edgelist, and an author-id -> node-id map."""
    graphml_out, edgelist_out, idmap_out = self.output()
    author_graph_file, _ = self.input()

    # Load the full author graph and reduce it to its giant component.
    graph = igraph.Graph.Read_GraphMLz(author_graph_file.path)
    lcc = graph.components().giant()
    lcc.write_graphmlz(graphml_out.path)
    lcc.write_edgelist(edgelist_out.path)

    # Persist the mapping from author id ('name' attribute) to node index.
    mapping = sorted((v['name'], v.index) for v in lcc.vs)
    util.write_csv_to_fwrapper(
        idmap_out, ('author_id', 'node_id'), mapping)
Example #6
Source File: build_graphs.py From dblp with MIT License | 5 votes |
def run(self):
    """Write ground-truth communities: one line of member node ids per
    venue, plus one line of venue ids per author node."""
    lcc_pickle_file, venue_map_file = self.input()
    # Read in the LCC graph.
    lcc = igraph.Graph.Read_Picklez(lcc_pickle_file.path)

    # Map each venue id to the node ids of the authors who published there.
    communities = {int(venue_id): []
                   for venue_id, _ in util.iter_csv_fwrapper(venue_map_file)}
    for vertex in lcc.vs:
        for venue_id in vertex['venues']:
            communities[venue_id].append(vertex.index)

    by_venue_file, by_author_file = self.output()

    # Save ground-truth communities: one venue per line, space-separated.
    ordered = sorted(communities.items())
    with by_venue_file.open('w') as f:
        f.write('\n'.join(' '.join(map(str, members))
                          for _, members in ordered))

    # Save each author's venue ids separately, one node per line.
    per_author = sorted((v.index, v['venues']) for v in lcc.vs)
    with by_author_file.open('w') as f:
        f.write('\n'.join(' '.join(map(str, venues))
                          for _, venues in per_author))
Example #7
Source File: filtering.py From dblp with MIT License | 5 votes |
def run(self):
    """Drop papers whose venue appears only once in the paper records."""
    with self.input().open() as infile:
        papers = pd.read_csv(infile)
    # Per-row frequency of the row's venue; keep venues seen more than once.
    venue_counts = papers.groupby('venue')['venue'].transform(len)
    survivors = papers[venue_counts > 1]
    with self.output().open('w') as outfile:
        survivors.to_csv(outfile, index=False)
Example #8
Source File: filtering.py From dblp with MIT License | 5 votes |
def run(self):
    """Write the unique venue names from the papers file, one per line."""
    # Only columns 0 and 2 are read; assumes 'venue' is among them —
    # TODO(review): confirm against the papers file schema.
    with self.papers_file.open() as pfile:
        papers = pd.read_csv(pfile, header=0, usecols=(0, 2))
    venues = papers['venue'].unique()
    with self.output().open('w') as out:
        out.write('\n'.join(venues))
Example #9
Source File: filtering.py From dblp with MIT License | 5 votes |
def run(self):
    """Keep only authorship records whose paper id survived filtering."""
    keep = self.read_paper_ids()
    with self.author_file.open() as afile:
        authors = pd.read_csv(afile)
    # Filter and write authorship records.
    surviving = authors[authors['paper_id'].isin(keep)]
    with self.output().open('w') as out:
        surviving.to_csv(out, index=False)
Example #10
Source File: build_graphs.py From dblp with MIT License | 5 votes |
def run(self):
    """Build the paper citation graph and persist it.

    One vertex per paper (annotated with its venue and author ids), one
    edge per citation record.  Writes a paper-id -> node-id CSV map and
    saves the graph in both pickle and graphml formats; also returns the
    graph object.
    """
    refg = igraph.Graph()
    nodes = self.read_paper_vertices()
    refg.add_vertices(nodes)
    # Build and save paper id to node id mapping
    idmap = {str(v['name']): v.index for v in refg.vs}
    rows = sorted(idmap.items())
    util.write_csv_to_fwrapper(
        self.idmap_output_file, ('paper_id', 'node_id'), rows)
    # Now add venues to nodes as paper attributes
    for paper_id, venue in self.read_paper_venues():
        node_id = idmap[paper_id]
        refg.vs[node_id]['venue'] = venue
    # next add author ids: start every vertex with an empty list, then
    # append one author id per authorship record
    for v in refg.vs:
        v['author_ids'] = []
    for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
        node_id = idmap[paper_id]
        refg.vs[node_id]['author_ids'].append(author_id)
    # Finally add edges from citation records
    citation_links = self.read_paper_references(idmap)
    refg.add_edges(citation_links)
    # Save in both pickle and graphml formats
    refg.write_picklez(self.pickle_output_file.path)
    refg.write_graphmlz(self.graphml_output_file.path)
    return refg
Example #11
Source File: build_graphs.py From dblp with MIT License | 5 votes |
def run(self):
    """Assemble the undirected author graph, save it as gzipped graphml,
    and write the author-id -> node-id map."""
    vertices = self.read_author_ids()
    links = self.get_edges()
    graph = util.build_undirected_graph(vertices, links)

    graph_output_file, idmap_output_file = self.output()
    # Write the graph to a gzipped graphml file.
    graph.write_graphmlz(graph_output_file.path)

    # Each vertex 'name' is an author id; map it to its node index.
    pairs = sorted((v['name'], v.index) for v in graph.vs)
    util.write_csv_to_fwrapper(
        idmap_output_file, ('author_id', 'node_id'), pairs)
Example #12
Source File: build_graphs.py From dblp with MIT License | 5 votes |
def run(self):
    """Attach venue-id sets to each LCC node and persist the result.

    Reads the LCC author graph plus venue/author data, tags every node
    with the ids of the venues its author published in, then saves the
    annotated graph (pickled) and a (venue_id, venue_name) CSV map.

    BUGFIX: ``dict.iteritems()`` is Python-2-only; ``items()`` works on
    both Python 2 and 3.
    """
    graph_file, idmap_file, paper_file, author_file = self.input()
    # Read in dependencies
    lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
    author_venue_df = self.build_linked_venue_frame()
    venue_map = self.assign_venue_ids(author_venue_df)
    records = util.iter_csv_fwrapper(idmap_file)
    lcc_idmap = {record[0]: int(record[1]) for record in records}
    # Use sets in order to ensure uniqueness.
    for v in lcc.vs:
        v['venues'] = set()
    # Add the venue IDs to the node venue sets.
    for rownum, (author_id, venue) in author_venue_df.iterrows():
        node_id = lcc_idmap[str(author_id)]
        venue_id = venue_map[venue]
        lcc.vs[node_id]['venues'].add(venue_id)
    # Convert the sets to tuples.
    for v in lcc.vs:
        v['venues'] = tuple(v['venues'])
    # save a copy of the graph with venue info
    pickle_outfile, venue_map_outfile = self.output()
    lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph
    rows = ((vnum, venue) for venue, vnum in venue_map.items())
    util.write_csv_to_fwrapper(
        venue_map_outfile, ('venue_id', 'venue_name'), rows)
Example #13
Source File: example1.py From sciluigi with MIT License | 5 votes |
def run(self):
    """Replace every 'A' with 'T' in the input data (via sed) and write
    the result to the out_replatot target."""
    command = ('cat ' + self.in_data().path +
               ' | sed "s/A/T/g" > ' + self.out_replatot().path)
    log.info("COMMAND TO EXECUTE: " + command)
    # NOTE(review): shell=True with interpolated paths — fine for trusted
    # task targets, but would be shell-injectable with untrusted paths.
    call(command, shell=True)

# Run this file as script
# ------------------------------------------------------------------------
Example #14
Source File: aminer.py From dblp with MIT License | 5 votes |
def run(self):
    """Write the unique venue names from the papers CSV, one per line.

    BUGFIX: ``reader.next()`` is Python-2-only; the builtin ``next()``
    works on Python 2.6+ and 3.
    """
    # find venue column
    with self.input().open() as papers_file:
        reader = csv.reader(papers_file)
        headers = next(reader)
        venue_index = headers.index('venue')
    # filter out unique venues (re-open, read only that column)
    with self.input().open() as papers_file:
        df = pd.read_csv(papers_file, header=0, usecols=(venue_index,))
        unique_venues = df['venue'].unique()
    # write to csv file
    with self.output().open('w') as outfile:
        outfile.write('\n'.join(map(str, unique_venues)))
Example #15
Source File: aminer.py From dblp with MIT License | 5 votes |
def run(self):
    """Write the unique publication years from the papers CSV, one per line.

    BUGFIX: ``reader.next()`` is Python-2-only; the builtin ``next()``
    works on Python 2.6+ and 3.
    """
    # find year column
    with self.input().open() as papers_file:
        reader = csv.reader(papers_file)
        headers = next(reader)
        year_index = headers.index('year')
    # filter out unique years (re-open, read only that column)
    with self.input().open() as papers_file:
        df = pd.read_csv(papers_file, header=0, usecols=(year_index,))
        unique_years = df['year'].unique()
    # write to csv file
    with self.output().open('w') as outfile:
        outfile.write('\n'.join(map(str, unique_years)))
Example #16
Source File: aminer.py From dblp with MIT License | 5 votes |
def run(self):
    """Persist one (id, name) CSV row per author."""
    rows = self.read_author_id_name_pairs()
    target = self.output()
    util.write_csv_to_fwrapper(target, ('id', 'name'), rows)
Example #17
Source File: aminer.py From dblp with MIT License | 5 votes |
def run(self):
    """Persist one (author_id, paper_id) CSV row per authorship record."""
    rows = self.iter_authorships()
    util.write_csv_to_fwrapper(
        self.output(), ('author_id', 'paper_id'), rows)
Example #18
Source File: interface.py From sciluigi with MIT License | 5 votes |
def run(*args, **kwargs):
    """Forward all positional and keyword arguments to ``luigi.run``."""
    luigi.run(*args, **kwargs)
Example #19
Source File: interface.py From sciluigi with MIT License | 5 votes |
def run_local(*args, **kwargs):
    """Forward to :func:`run`, forcing luigi's local scheduler."""
    run(local_scheduler=True, *args, **kwargs)
Example #20
Source File: interface.py From sciluigi with MIT License | 5 votes |
def run(*args, **kwargs):
    """Thin wrapper that delegates every argument to ``luigi.run``."""
    luigi.run(*args, **kwargs)
Example #21
Source File: interface.py From sciluigi with MIT License | 5 votes |
def run_local(*args, **kwargs):
    """Invoke :func:`run` with ``local_scheduler=True`` preset."""
    run(local_scheduler=True, *args, **kwargs)
Example #22
Source File: tasks.py From luigi-td with Apache License 2.0 | 5 votes |
def run(self):
    """Run the configured query and pretty-print the job id, result size,
    header row, and every result row (tab-separated) to stdout.

    BUGFIX: the original used Python-2 print statements, which are syntax
    errors under Python 3; equivalent print() calls produce the same output.
    """
    result = self.run_query(self.query())
    print('====================')
    print("Job ID :", result.job_id)
    print("Result size:", result.size)
    print("Result :")
    print("\t".join([c[0] for c in result.description]))
    print("----")
    for row in result:
        print("\t".join([str(c) for c in row]))
    print('====================')
Example #23
Source File: newspapers.py From gluish with GNU General Public License v3.0 | 5 votes |
def run(self):
    """Construct the document id from the date and the url, then store the
    page content (decoded as UTF-8) plus url and date as a JSON document.

    BUGFIXES for Python 3: ``hashlib.sha1`` requires bytes, so the id
    string is encoded before hashing; the Python-2-only ``unicode()``
    becomes ``str()``.
    """
    document = {}
    key = '%s:%s' % (self.date, self.url)
    document['_id'] = hashlib.sha1(key.encode('utf-8')).hexdigest()
    with self.input().open() as handle:
        # Input is a byte stream; decode leniently, dropping bad bytes.
        document['content'] = handle.read().decode('utf-8', 'ignore')
    document['url'] = self.url
    document['date'] = str(self.date)
    with self.output().open('w') as output:
        output.write(json.dumps(document))
Example #24
Source File: tasks.py From luigi-td with Apache License 2.0 | 5 votes |
def run(self):
    """Execute the configured query and stream its result into the output
    target as CSV."""
    query_result = self.run_query(self.query())
    with self.output().open('w') as f:
        query_result.to_csv(f)

## Building Pipelines
Example #25
Source File: tasks.py From luigi-td with Apache License 2.0 | 5 votes |
def run(self):
    """Retrieve the upstream result and save it to the output as CSV."""
    with self.output().open('w') as f:
        self.input().result.to_csv(f)
Example #26
Source File: tasks.py From luigi-td with Apache License 2.0 | 5 votes |
def run(self):
    """Read the upstream result, echo it to stdout, then mark completion by
    writing 'done' to the output target.

    BUGFIX: Python-2 ``print f.read()`` is a syntax error under Python 3.
    """
    with self.input().open() as f:
        # process the result here
        print(f.read())
    with self.output().open('w') as f:
        # create the final output
        f.write('done')

## Templating Queries
Example #27
Source File: tasks.py From luigi-td with Apache License 2.0 | 5 votes |
def run(self):
    """Concatenate the rows of every upstream ResultTarget into a single
    output file, one row (stringified) per line."""
    with self.output().open('w') as f:
        # Repeat for each ResultTarget, merging rows into one file.
        for target in self.input():
            for row in target.result:
                f.write(str(row) + "\n")
Example #28
Source File: luigi_pig.py From HadoopWithPython with MIT License | 5 votes |
def pig_script_path(self):
    """Return the path of the pig script this task should run."""
    return self.script_path
Example #29
Source File: gutenberg.py From gluish with GNU General Public License v3.0 | 5 votes |
def run(self):
    """Download the Gutenberg MARC catalog, decompress it, and move the
    result into place as this task's output."""
    url = "http://gutenberg.readingroo.ms/cache/generated/feeds/catalog.marc.bz2"
    archive = shellout('wget -q "{url}" -O {output}', url=url)
    unpacked = shellout('bunzip2 {input} -c > {output}', input=archive)
    luigi.LocalTarget(unpacked).move(self.output().path)
Example #30
Source File: gutenberg.py From gluish with GNU General Public License v3.0 | 5 votes |
def run(self):
    """Extract (id, term) TSV rows from the MARC dump: each 653.a value is
    split on '|' and then on '--', one trimmed term per output row."""
    tsv_path = shellout('marctotsv -k -s "|" {input} 001 653.a > {output}',
                        input=self.input().get('dump').path)
    with luigi.LocalTarget(tsv_path, format=TSV).open() as handle:
        with self.output().open('w') as output:
            for row in handle.iter_tsv(cols=('id', 'terms')):
                for subfield in row.terms.split('|'):
                    for term in subfield.split('--'):
                        output.write_tsv(row.id, term.strip())