python source code of table

#!/usr/bin/python2.7
# All rights to this package are hereby disclaimed and its contents
# released into the public domain by the authors.

'''Helper class to read BigQuery tables and metadata.

This module contains code to read BigQuery tables using the
TableData.list() API and save the results to a file.
Usage:
  table_reader = TableReader(project_id=<project_id>,
                              dataset_id=<dataset_id>,
                              table_id=<table_id>)
  with FileResultHandler(<output_file_name>) as result_handler:
    table_reader.read(result_handler)
will read the BigQuery table <project_id>:<dataset_id>.<table_id>
and download the results to the file <output_file_name>.
'''

import json
import os
import sys
import threading
import time

# Imports from the Google API client:
from apiclient.errors import HttpError

# Imports from files in this directory:
import auth

READ_CHUNK_SIZE= 64 * 1024

class ResultHandler:
  '''Abstract class to handle reading TableData rows.'''

  def handle_rows(self, rows):
    '''Process one page of results.'''
    pass

class TableReader:
  '''Reads data from a BigQuery table.'''

  def __init__(self, project_id, dataset_id, table_id,
      start_index=None, read_count=None, next_page_token=None):
    self.project_id = project_id
    self.dataset_id = dataset_id
    self.bq_service = auth.build_bq_client()
    self.next_page_token = next_page_token
    self.next_index = start_index
    self.rows_left = read_count
    self.table_id = table_id

  def get_table_info(self):
    '''Returns a tuple of (modified time, row count) for the table.'''

    table = self.bq_service.tables().get(
        projectId=self.project_id,
        datasetId=self.dataset_id,
        tableId=self.table_id).execute()
    last_modified = int(table.get('lastModifiedTime', 0))
    row_count = int(table.get('numRows', 0))
    print '%s last modified at %d' % (
      table['id'], last_modified)
    return (last_modified, row_count)

  def advance(self, rows, page_token):
    '''Called after reading a page, advances current indices.'''

    done = page_token is None
    if self.rows_left is not None:
      self.rows_left -= len(rows)
      if self.rows_left < 0: print 'Error: Read too many rows!'
      if self.rows_left <= 0: done = True

    if self.next_index is not None:
      self.next_index += len(rows)
    else:
      # Only use page tokens when we're not using
      # index-based pagination.
      self.next_page_token = page_token
    return done

  def get_table_id(self):
    if '@' in self.table_id and self.snapshot_time is not None:
      raise Exception("Table already has a snapshot time")
    if self.snapshot_time is None:
      return self.table_id
    else:
      return '%s@%d' % (self.table_id, self.snapshot_time)

  def make_read_message(self, row_count, max_results):
    '''Creates a status message for the current read operation.'''
    read_msg =  'Read %d rows' % (row_count,)
    if self.next_index is not None:
      read_msg = '%s at %s' % (read_msg, self.next_index)
    elif self.next_page_token is not None:
      read_msg = '%s at %s' % (read_msg, self.next_page_token)
    else:
      read_msg = '%s from start' % (read_msg,)
    if max_results <> row_count:
       read_msg = '%s [max %d]' % (read_msg, max_results)
    return read_msg

  def read_one_page(self, max_results=READ_CHUNK_SIZE):
    '''Reads one page from the table.'''

    while True:
      try:
        if self.rows_left is not None and self.rows_left < max_results:
          max_results = self.rows_left

        data = self.bq_service.tabledata().list(
            projectId=self.project_id,
            datasetId=self.dataset_id,
            tableId=self.get_table_id(),
            startIndex=self.next_index,
            pageToken=self.next_page_token,
            maxResults=max_results).execute()
        next_page_token = data.get('pageToken', None)
        rows = data.get('rows', [])
        print self.make_read_message(len(rows), max_results)
        is_done = self.advance(rows, next_page_token)
        return (is_done, rows)
      except HttpError, err:
        # If the error is a rate limit or connection error, wait and
        # try again.
        if err.resp.status in [403, 500, 503]:
          print '%s: Retryable error %s, waiting' % (
              self.thread_id, err.resp.status,)
          time.sleep(5)
        else: raise

  def read(self, result_handler, snapshot_time=None):
    '''Reads an entire table until the end or we hit a row limit.'''
    # Read the current time and use that for the snapshot time.
    # This will prevent us from getting inconsistent results when the
    # underlying table is changing.
    if snapshot_time is None and not '@' in self.table_id:
      self.snapshot_time = int(time.time() * 1000)
    self.snapshot_time = snapshot_time
    while True:
      is_done, rows = self.read_one_page()
      if rows:
        result_handler.handle_rows(rows)
      if is_done:
        return


class FileResultHandler(ResultHandler):
  '''Result handler that saves rows to a file.'''

  def __init__(self, output_file_name):
    self.output_file_name = output_file_name
    print 'Writing results to %s' % (output_file_name,)

  def __enter__(self):
    self.make_output_dir()
    self.output_file =  open(self.output_file_name, 'w')
    return self

  def __exit__(self, type, value, traceback):
    if self.output_file:
      self.output_file.close()
      self.output_file = None

  def make_output_dir(self):
    '''Creates an output directory for the downloaded results.'''

    output_dir = os.path.dirname(self.output_file_name)
    if os.path.exists(output_dir) and os.path.isdir(output_dir):
      # Nothing to do.
      return
    os.makedirs(output_dir)

  def handle_rows(self, rows):
    if self.output_file is None:
      self.__enter()
    self.output_file.write(json.dumps(rows, indent=2))


class TableReadThread (threading.Thread):
  '''Thread that reads from a table and writes it to a file.'''
  def __init__(self, table_reader, output_file_name,
               thread_id='thread'):
    threading.Thread.__init__(self)
    self.table_reader = table_reader
    self.output_file_name = output_file_name
    self.thread_id = thread_id

  def run(self):
    print 'Reading %s' % (self.thread_id,)
    with FileResultHandler(self.output_file_name) as result_handler:
      self.table_reader.read(result_handler)


def main(argv):
  if len(argv) == 0:
     argv = ['publicdata',
             'samples',
             'shakespeare',
             '/tmp/bigquery']
  if len(argv) < 4:
    # Wrong number of args, print the usage and quit.
    arg_names = [sys.argv[0],
                 '<project_id>',
                 '<dataset_id>',
                 '<table_id>',
                 '<output_directory>']
    print 'Usage: %s' % (' '.join(arg_names))
    print 'Got: %s' % (argv,)
    return

  table_reader = TableReader(project_id=argv[0],
                             dataset_id=argv[1],
                             table_id=argv[2])
  # Come up with a suitably obtuse file name.
  output_file_name = os.path.join(
      argv[3],
      table_reader.project_id,
      table_reader.dataset_id,
      table_reader.table_id)
  thread = TableReadThread(table_reader, output_file_name)
  thread.start()
  thread.join()

if __name__ == "__main__":
    main(sys.argv[1:])