Python luigi.run() Examples

The following are 30 code examples of luigi.run(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module luigi, or try the search function.
Example #1
Source File: repdocs.py    From dblp with MIT License 6 votes vote down vote up
def run(self):
        """Build one representative document ("repdoc") per author.

        Collects the repdoc of every paper an author wrote, joins them
        with '|', and writes (author_id, doc) rows to the output CSV.
        """
        paper_repdocs_file, author_file = self.input()

        # Paper repdocs indexed by paper id; missing docs become ''.
        with paper_repdocs_file.open() as pfile:
            paper_df = pd.read_csv(pfile, index_col=(0,))
            paper_df.fillna('', inplace=True)

        # read out authorship records
        with author_file.open() as afile:
            author_df = pd.read_csv(afile, header=0, index_col=(0,))

        # initialize repdoc dictionary from complete list of person ids
        author_ids = author_df.index.unique()
        repdocs = {i: [] for i in author_ids}

        # build up repdocs for each author
        for person_id, paper_id in author_df.itertuples():
            doc = paper_df.loc[paper_id]['doc']
            repdocs[person_id].append(doc)

        # save repdocs
        # Fix: dict.iteritems() is Python 2 only and raises AttributeError
        # on Python 3; items() works on both and matches usage elsewhere
        # in this codebase.
        rows = ((person_id, '|'.join(docs))
                for person_id, docs in repdocs.items())
        util.write_csv_to_fwrapper(self.output(), ('author_id', 'doc'), rows)
Example #2
Source File: filtering.py    From dblp with MIT License 6 votes vote down vote up
def run(self):
        """Filter papers to the [self.start, self.end] year range and keep
        only references whose endpoints both survived the filter."""
        papers_target, refs_target = self.input()
        papers_out_target, refs_out_target = self.output()

        with papers_target.open() as infile:
            papers = pd.read_csv(infile)

        # Restrict papers to the configured year range.
        papers['year'] = papers['year'].astype(int)
        in_range = ((papers['year'] >= self.start) &
                    (papers['year'] <= self.end))
        kept_papers = papers[in_range]

        # Save the surviving paper records and remember their ids.
        with papers_out_target.open('w') as outfile:
            kept_papers.to_csv(outfile, index=False)
            kept_ids = kept_papers['id'].unique()

        with refs_target.open() as infile:
            refs = pd.read_csv(infile)

        # A citation survives only if both endpoints survived the filter.
        both_kept = (refs['paper_id'].isin(kept_ids) &
                     refs['ref_id'].isin(kept_ids))
        with refs_out_target.open('w') as outfile:
            refs[both_kept].to_csv(outfile, index=False)
Example #3
Source File: wordcount.py    From HadoopWithPython with MIT License 6 votes vote down vote up
def run(self):
      """
      Count word frequencies in the input and write one tab-separated
      (word, count) line per word to the output.

      Fix: the original never closed the input handle and leaked the
      output handle on error; ``with`` blocks guarantee cleanup.
      """
      count = {}

      with self.input().open('r') as ifp:
          for line in ifp:
              for word in line.strip().split():
                  count[word] = count.get(word, 0) + 1

      with self.output().open('w') as ofp:
          for k, v in count.items():
              ofp.write('{}\t{}\n'.format(k, v))
Example #4
Source File: convert.py    From dblp with MIT License 6 votes vote down vote up
def run(self):
        """Flatten the Matrix Market corpus into one ``docnum<TAB>term_id``
        line per (document, term) pair."""
        corpus = gensim.corpora.MmCorpus(self.input().path)
        with self.output().open('w') as outfile:
            for docnum, offset in enumerate(corpus.index):
                # docbyoffset yields (term_id, frequency) pairs for the doc;
                # frequencies are ignored, only term membership is kept.
                doc = corpus.docbyoffset(offset)
                lines = ['%s\t%s' % (docnum, term_id) for term_id, _ in doc]
                outfile.write('%s\n' % '\n'.join(lines))
Example #5
Source File: build_graphs.py    From dblp with MIT License 6 votes vote down vote up
def run(self):
        """Extract the largest connected component (LCC) of the author graph
        and save it as graphml, edgelist, plus an author-id -> node-id map."""
        graphml_out, edgelist_out, idmap_out = self.output()
        author_graph_file, _ = self.input()

        # Load the full author graph and reduce it to its giant component.
        full_graph = igraph.Graph.Read_GraphMLz(author_graph_file.path)
        giant = full_graph.components().giant()
        giant.write_graphmlz(graphml_out.path)
        giant.write_edgelist(edgelist_out.path)

        # Persist the author-name -> LCC-node-index mapping, sorted by name.
        idmap = {v['name']: v.index for v in giant.vs}
        util.write_csv_to_fwrapper(
            idmap_out, ('author_id', 'node_id'), sorted(idmap.items()))
Example #6
Source File: build_graphs.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Emit ground-truth venue communities (one line of node ids per
        venue) and, per author node, the list of venue ids it belongs to."""
        lcc_pickle_file, venue_map_file = self.input()

        # Read in the LCC author graph.
        lcc = igraph.Graph.Read_Picklez(lcc_pickle_file.path)

        # Map every venue id to the node ids of the authors published there.
        venue_rows = util.iter_csv_fwrapper(venue_map_file)
        communities = {int(vid): [] for vid, _ in venue_rows}
        for vertex in lcc.vs:
            for vid in vertex['venues']:
                communities[vid].append(vertex.index)

        by_venue_file, by_author_file = self.output()

        # One community per line (sorted by venue id), node ids space-joined.
        with by_venue_file.open('w') as f:
            lines = (' '.join(map(str, members))
                     for _, members in sorted(communities.items()))
            f.write('\n'.join(lines))

        # One line per node (sorted by node id): that author's venue ids.
        with by_author_file.open('w') as f:
            per_author = sorted((v.index, v['venues']) for v in lcc.vs)
            f.write('\n'.join(' '.join(map(str, venues))
                              for _, venues in per_author))
Example #7
Source File: filtering.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Drop papers whose venue occurs only once in the corpus."""
        with self.input().open() as infile:
            papers = pd.read_csv(infile)

        # Keep a row only when its venue appears more than once overall.
        venue_counts = papers.groupby('venue')['venue'].transform(len)
        with self.output().open('w') as outfile:
            papers[venue_counts > 1].to_csv(outfile, index=False)
Example #8
Source File: filtering.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Write the distinct venue names, one per line, in first-seen order."""
        # Only columns 0 and 2 of the papers CSV are loaded; one of them is
        # the 'venue' column.
        with self.papers_file.open() as pfile:
            papers = pd.read_csv(pfile, header=0, usecols=(0, 2))

        with self.output().open('w') as outfile:
            outfile.write('\n'.join(papers['venue'].unique()))
Example #9
Source File: filtering.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Keep only authorship rows whose paper survived filtering."""
        keep_ids = self.read_paper_ids()
        with self.author_file.open() as afile:
            authorships = pd.read_csv(afile)

        # Filter authorship records down to surviving papers and save.
        surviving = authorships[authorships['paper_id'].isin(keep_ids)]
        with self.output().open('w') as outfile:
            surviving.to_csv(outfile, index=False)
Example #10
Source File: build_graphs.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Assemble the paper citation graph: vertices are papers (carrying
        venue and author-id attributes), edges are citations.  Saves the
        graph in pickle and graphml form and returns it."""
        citation_graph = igraph.Graph()
        citation_graph.add_vertices(self.read_paper_vertices())

        # Persist the paper-id -> node-id mapping for downstream tasks.
        idmap = {str(v['name']): v.index for v in citation_graph.vs}
        util.write_csv_to_fwrapper(
            self.idmap_output_file, ('paper_id', 'node_id'),
            sorted(idmap.items()))

        # Attach each paper's venue as a vertex attribute.
        for paper_id, venue in self.read_paper_venues():
            citation_graph.vs[idmap[paper_id]]['venue'] = venue

        # Attach author ids: every vertex starts with an empty list, then
        # one author id is appended per authorship record.
        for vertex in citation_graph.vs:
            vertex['author_ids'] = []
        for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
            citation_graph.vs[idmap[paper_id]]['author_ids'].append(author_id)

        # Citation records are translated to node ids via the idmap.
        citation_graph.add_edges(self.read_paper_references(idmap))

        # Save in both pickle and graphml formats.
        citation_graph.write_picklez(self.pickle_output_file.path)
        citation_graph.write_graphmlz(self.graphml_output_file.path)
        return citation_graph
Example #11
Source File: build_graphs.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Build the undirected co-authorship graph and save it together
        with an author-id -> node-id map."""
        author_ids = self.read_author_ids()
        coauthor_edges = self.get_edges()
        author_graph = util.build_undirected_graph(author_ids, coauthor_edges)

        # Write the graph to a gzipped graphml file.
        graph_out, idmap_out = self.output()
        author_graph.write_graphmlz(graph_out.path)

        # Persist the author-name -> node-index mapping, sorted by name.
        idmap = {v['name']: v.index for v in author_graph.vs}
        util.write_csv_to_fwrapper(
            idmap_out, ('author_id', 'node_id'), sorted(idmap.items()))
Example #12
Source File: build_graphs.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Annotate each LCC author node with the ids of the venues it has
        published in, then save the annotated graph and the venue-id map.
        """
        graph_file, idmap_file, paper_file, author_file = self.input()

        # Read in dependencies
        lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
        author_venue_df = self.build_linked_venue_frame()
        venue_map = self.assign_venue_ids(author_venue_df)

        # Map author id (as string) -> LCC node id.
        records = util.iter_csv_fwrapper(idmap_file)
        lcc_idmap = {record[0]: int(record[1]) for record in records}

        # Use sets in order to ensure uniqueness.
        for v in lcc.vs:
            v['venues'] = set()

        # Add the venue IDs to the node venue sets.
        for rownum, (author_id, venue) in author_venue_df.iterrows():
            node_id = lcc_idmap[str(author_id)]
            venue_id = venue_map[venue]
            lcc.vs[node_id]['venues'].add(venue_id)

        # Convert the sets to tuples.
        for v in lcc.vs:
            v['venues'] = tuple(v['venues'])

        # save a copy of the graph with venue info
        pickle_outfile, venue_map_outfile = self.output()
        lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

        # Fix: dict.iteritems() is Python 2 only and raises AttributeError
        # on Python 3; items() is portable.
        rows = ((vnum, venue) for venue, vnum in venue_map.items())
        util.write_csv_to_fwrapper(
            venue_map_outfile, ('venue_id', 'venue_name'), rows)
Example #13
Source File: example1.py    From sciluigi with MIT License 5 votes vote down vote up
def run(self):
        """Replace every 'A' with 'T' in the input file via a shell pipeline."""
        # NOTE(review): shell=True with interpolated paths is fragile if a
        # path ever contains spaces or shell metacharacters -- confirm that
        # task paths are always safe.
        cmd = 'cat {} | sed "s/A/T/g" > {}'.format(
            self.in_data().path, self.out_replatot().path)
        log.info("COMMAND TO EXECUTE: " + cmd)
        call(cmd, shell=True)


# Run this file as script
# ------------------------------------------------------------------------ 
Example #14
Source File: aminer.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Write the distinct venue names from the papers CSV, one per line."""
        # Find the venue column index from the header row.
        # Fix: reader.next() is Python 2 only; the builtin next() works on
        # both Python 2 and 3.
        with self.input().open() as papers_file:
            reader = csv.reader(papers_file)
            headers = next(reader)
            venue_index = headers.index('venue')

        # Re-open the input and load only the venue column, then dedupe.
        with self.input().open() as papers_file:
            df = pd.read_csv(papers_file, header=0, usecols=(venue_index,))
            unique_venues = df['venue'].unique()

        # write to csv file
        with self.output().open('w') as outfile:
            outfile.write('\n'.join(map(str, unique_venues)))
Example #15
Source File: aminer.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Write the distinct publication years, one per line."""
        # Find the year column index from the header row.
        # Fix: reader.next() is Python 2 only; use the builtin next().
        with self.input().open() as papers_file:
            reader = csv.reader(papers_file)
            headers = next(reader)
            year_index = headers.index('year')

        # Re-open the input and load only the year column, then dedupe.
        with self.input().open() as papers_file:
            df = pd.read_csv(papers_file, header=0, usecols=(year_index,))
            unique_years = df['year'].unique()

        # write to csv file
        with self.output().open('w') as outfile:
            outfile.write('\n'.join(map(str, unique_years)))
Example #16
Source File: aminer.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Dump (author id, author name) pairs to the output CSV."""
        pairs = self.read_author_id_name_pairs()
        util.write_csv_to_fwrapper(self.output(), ('id', 'name'), pairs)
Example #17
Source File: aminer.py    From dblp with MIT License 5 votes vote down vote up
def run(self):
        """Dump (author_id, paper_id) authorship pairs to the output CSV."""
        pairs = self.iter_authorships()
        util.write_csv_to_fwrapper(
            self.output(), ('author_id', 'paper_id'), pairs)
Example #18
Source File: interface.py    From sciluigi with MIT License 5 votes vote down vote up
def run(*args, **kwargs):
    '''
    Forward all positional and keyword arguments to luigi.run().

    Thin convenience wrapper so sciluigi callers need not import luigi
    directly.
    '''
    luigi.run(*args, **kwargs)
Example #19
Source File: interface.py    From sciluigi with MIT License 5 votes vote down vote up
def run_local(*args, **kwargs):
    '''
    Forward to run() with the local (in-process) scheduler enabled.

    NOTE(review): callers must not pass local_scheduler themselves, or the
    call raises a duplicate-keyword TypeError.
    '''
    run(local_scheduler=True, *args, **kwargs)
Example #20
Source File: interface.py    From sciluigi with MIT License 5 votes vote down vote up
def run(*args, **kwargs):
    '''
    Forward all positional and keyword arguments to luigi.run().

    Thin convenience wrapper around luigi's own entry point.
    '''
    luigi.run(*args, **kwargs)
Example #21
Source File: interface.py    From sciluigi with MIT License 5 votes vote down vote up
def run_local(*args, **kwargs):
    '''
    Forward to run() with the local (in-process) scheduler enabled.

    NOTE(review): passing local_scheduler from the caller as well would
    raise a duplicate-keyword TypeError.
    '''
    run(local_scheduler=True, *args, **kwargs)
Example #22
Source File: tasks.py    From luigi-td with Apache License 2.0 5 votes vote down vote up
def run(self):
        """Execute the task's query and pretty-print the result to stdout.

        Fix: converted Python 2 print statements (a syntax error on
        Python 3) to print() calls; the emitted text is unchanged.
        """
        result = self.run_query(self.query())
        print('====================')
        print("Job ID     :", result.job_id)
        print("Result size:", result.size)
        print("Result     :")
        # Header row: tab-joined column names from the result description.
        print("\t".join([c[0] for c in result.description]))
        print("----")
        for row in result:
            print("\t".join([str(c) for c in row]))
        print('====================')
Example #23
Source File: newspapers.py    From gluish with GNU General Public License v3.0 5 votes vote down vote up
def run(self):
        """ Construct the document id from the date and the url. """
        document = {}
        # Fix: hashlib.sha1 requires bytes on Python 3 and the unicode()
        # builtin no longer exists; encode the key and use str() instead.
        key = '%s:%s' % (self.date, self.url)
        document['_id'] = hashlib.sha1(key.encode('utf-8')).hexdigest()
        with self.input().open() as handle:
            content = handle.read()
            # The target may yield bytes or already-decoded text; decode
            # only when needed so both cases work.
            if isinstance(content, bytes):
                content = content.decode('utf-8', 'ignore')
            document['content'] = content
        document['url'] = self.url
        document['date'] = str(self.date)
        with self.output().open('w') as output:
            output.write(json.dumps(document))
Example #24
Source File: tasks.py    From luigi-td with Apache License 2.0 5 votes vote down vote up
def run(self):
        """Run the task's query and stream the result into the output CSV."""
        query_result = self.run_query(self.query())
        with self.output().open('w') as outfile:
            query_result.to_csv(outfile)

## Building Pipelines 
Example #25
Source File: tasks.py    From luigi-td with Apache License 2.0 5 votes vote down vote up
def run(self):
        """Save the upstream target's query result as a CSV file."""
        with self.output().open('w') as outfile:
            # The upstream ResultTarget exposes the result object directly.
            self.input().result.to_csv(outfile)
Example #26
Source File: tasks.py    From luigi-td with Apache License 2.0 5 votes vote down vote up
def run(self):
        """Echo the upstream result to stdout and mark the task done.

        Fixes: Python 2 print statement (a syntax error on Python 3)
        converted to a print() call; comment typo 'crate' -> 'create'.
        """
        with self.input().open() as f:
            # process the result here
            print(f.read())
        with self.output().open('w') as f:
            # create the final output
            f.write('done')

## Templating Queries 
Example #27
Source File: tasks.py    From luigi-td with Apache License 2.0 5 votes vote down vote up
def run(self):
        """Concatenate every upstream ResultTarget's rows into one file,
        one str()-formatted row per line."""
        with self.output().open('w') as outfile:
            # repeat for each upstream ResultTarget
            for target in self.input():
                for record in target.result:
                    outfile.write(str(record) + "\n")
Example #28
Source File: luigi_pig.py    From HadoopWithPython with MIT License 5 votes vote down vote up
def pig_script_path(self):
      """
      Return the path to the pig script to run.

      Simply forwards the task's ``script_path`` attribute; presumably an
      override of luigi's Pig task hook -- confirm against the base class.
      """
      return self.script_path
Example #29
Source File: gutenberg.py    From gluish with GNU General Public License v3.0 5 votes vote down vote up
def run(self):
        """Download the Gutenberg MARC catalog dump, decompress it, and move
        the result into this task's output target."""
        catalog_url = "http://gutenberg.readingroo.ms/cache/generated/feeds/catalog.marc.bz2"
        # shellout runs the template and returns the temp output path.
        downloaded = shellout('wget -q "{url}" -O {output}', url=catalog_url)
        unpacked = shellout('bunzip2 {input} -c > {output}', input=downloaded)
        luigi.LocalTarget(unpacked).move(self.output().path)
Example #30
Source File: gutenberg.py    From gluish with GNU General Public License v3.0 5 votes vote down vote up
def run(self):
        # Extract (record id, subject terms) rows from the MARC dump with
        # marctotsv: field 001 is the control number, 653.a the index-term
        # subfield, '|' the separator for repeated subfields.
        # NOTE(review): the -k and -s flag semantics are assumed from usage
        # here -- confirm against the marctotsv documentation.
        output = shellout('marctotsv -k -s "|" {input} 001 653.a > {output}',
                 input=self.input().get('dump').path)
        with luigi.LocalTarget(output, format=TSV).open() as handle:
            with self.output().open('w') as output:
                # Each '|'-separated subfield may itself pack several terms
                # joined by '--'; emit one (id, term) TSV row per term.
                for row in handle.iter_tsv(cols=('id', 'terms')):
                    for subfield in row.terms.split('|'):
                        for term in subfield.split('--'):
                            term = term.strip()
                            output.write_tsv(row.id, term)