from queue import Queue
import argparse
import sys
import threading
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup


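# Shared state for the worker threads: a lock to serialize writes, a queue of
# Google "start" offsets to fetch, and a set that de-duplicates results.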
add_lock = threading.Lock()
count_queue = Queue()
search_results = set()

parser = argparse.ArgumentParser(description='FireProx API Google Scraper')
parser.add_argument('--proxy', help='FireProx API URL', type=str, default=None)
parser.add_argument('--search', help='Search term', type=str, default=None)
parser.add_argument('--pages', help='Highest Google "start" offset to request, stepped by 100 (default: 1000)', type=int, default=1000)
args = parser.parse_args()
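# Example invocation (the script name and FireProx endpoint below are
# illustrative, not taken from this repo):
#   python scraper.py --proxy https://abc123.execute-api.us-east-1.amazonaws.com/fireprox --search "target term"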


def check_query(count, url, query):
	url = url.rstrip('/')

	url = f'{url}/search?q={quote_plus(query)}&start={count}&num=100'
	headers = {
		'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
	}
	results = requests.get(url, headers=headers, timeout=30)

	soup = BeautifulSoup(results.text, 'lxml')

	with add_lock:
		# Each organic result sits in <div class="r"> (Google changes this
		# markup periodically, so the selector may need updating).
		for g in soup.find_all('div', class_='r'):
			link = g.find('a')
			title = g.find('h3')
			if link is None or title is None:
				continue
			search_results.add(f'{title.text} ({link["href"]})')


def process_queue(url, query):
	while True:
		current_count = count_queue.get()
		try:
			check_query(current_count, url, query)
		except requests.RequestException:
			pass  # drop this page on a failed request
		finally:
			count_queue.task_done()  # always signal, or join() would hang


def main():
	# Both --proxy and --search are required.
	if not all([args.proxy, args.search]):
		parser.print_help()
		sys.exit(1)

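	# Spin up 100 daemon workers; daemon threads let the process exit once
	# main() returns instead of blocking on the infinite worker loops.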
	for _ in range(100):
		t = threading.Thread(target=process_queue, args=(args.proxy, args.search))
		t.daemon = True
		t.start()

	start = time.time()

	# Queue Google "start" offsets 0, 100, 200, ... up to args.pages;
	# each request asks for 100 results (num=100).
	for count in range(0, args.pages + 1, 100):
		count_queue.put(count)

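	# Block until every queued offset has been fetched and parsed.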
	count_queue.join()

	for result in search_results:
		print(result)

	print(f'Results: {len(search_results)}')
	print(f'Execution time: {time.time() - start:.5f}')


if __name__ == '__main__':
	main()