2020-05-24 23:03:11 +03:00
|
|
|
from lxml import etree
|
2020-05-12 09:45:56 +03:00
|
|
|
import random
|
2020-06-11 22:38:51 +03:00
|
|
|
import requests
|
|
|
|
from requests import Response
|
2020-04-24 05:59:43 +03:00
|
|
|
import urllib.parse as urlparse
|
|
|
|
|
2020-05-24 23:03:11 +03:00
|
|
|
# Google endpoints; an encoded query string is appended directly to each
SEARCH_URL = 'https://www.google.com/search?gbv=1&q='
AUTOCOMPLETE_URL = 'https://suggestqueries.google.com/complete/search?client=toolbar&'

# User-Agent templates; the '{}' slots are filled in by gen_user_agent
# (mobile: product, browser -- desktop: product, platform, browser)
MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'

# Query parameters that gen_query pre-seeds (as empty strings, in this order)
VALID_PARAMS = [
    'tbs',
    'tbm',
    'start',
    'near',
    'source',
]
|
2020-04-29 03:19:34 +03:00
|
|
|
|
2020-04-24 05:59:43 +03:00
|
|
|
|
2020-06-11 22:38:51 +03:00
|
|
|
def gen_user_agent(is_mobile):
    """Return a randomized User-Agent string built from the UA templates.

    Three name fragments are picked at random and suffixed with 'zilla',
    'fox' and 'ux' respectively. A truthy ``is_mobile`` fills MOBILE_UA with
    the first two; otherwise DESKTOP_UA is filled with all three.
    """
    # All three draws happen up front, in this order, whichever template is used
    product = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla'
    browser = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
    platform = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'

    template, fields = (
        (MOBILE_UA, (product, browser)) if is_mobile
        else (DESKTOP_UA, (product, platform, browser))
    )
    return template.format(*fields)
|
2020-04-24 05:59:43 +03:00
|
|
|
|
|
|
|
|
2020-05-23 23:27:23 +03:00
|
|
|
def gen_query(query, args, config, near_city=None):
    """Build the URL-quoted search query followed by its '&key=value' parameters.

    Args:
        query: Raw search text. May contain a ':past <range>' suffix
            (hour/day/week/month/year) to request a time-limited search.
        args: Mapping of request arguments; must support ``in`` and ``.get``.
            The keys 'tbs', 'tbm', 'start' and 'source' are consulted.
        config: Object exposing ``lang``, ``ctry`` and ``safe`` attributes.
        near_city: Optional city name added (URL-quoted) as the 'near' param.

    Returns:
        The quoted query with every non-empty parameter appended, in the order
        tbs, tbm, start, near, source, lr, cr, hl, safe. The result is meant to
        be appended to a base URL that already ends in 'q='.
    """
    param_dict = {key: '' for key in VALID_PARAMS}

    # Use :past(hour/day/week/month/year) if available
    # example search "new restaurants :past month"
    sub_lang = ''
    if ':past' in query and 'tbs' not in args:
        time_range = query.split(':past', 1)[-1].strip()
        # A bare ':past' with nothing after it has no range to read; skip the
        # filter instead of raising IndexError on the empty string.
        if time_range:
            param_dict['tbs'] = '&tbs=qdr:' + time_range[0].lower()
    elif 'tbs' in args:
        result_tbs = args.get('tbs')
        param_dict['tbs'] = '&tbs=' + result_tbs

        # Occasionally the 'tbs' param provided by google also contains a field
        # for 'lr', but formatted strangely.
        # Ex/ &tbs=qdr:h,lr:lang_1pl --> the lr value is extracted here, and its
        # digit(s) are removed further down when the 'lr' param is built.
        lang_fields = [field for field in result_tbs.split(',') if 'lr:' in field]
        if lang_fields:
            first = lang_fields[0]
            sub_lang = first[first.find('lr:') + 3:]

    # Ensure search query is parsable (note: any ':past ...' text stays in it)
    query = urlparse.quote(query)

    # Pass along type of results (news, images, books, etc)
    if 'tbm' in args:
        param_dict['tbm'] = '&tbm=' + args.get('tbm')

    # Get results page start value (10 per page, ie page 2 start val = 20)
    if 'start' in args:
        param_dict['start'] = '&start=' + args.get('start')

    # Search for results near a particular city, if available
    if near_city:
        param_dict['near'] = '&near=' + urlparse.quote(near_city)

    # Set language for results (lr) if source isn't set, otherwise use the result
    # language param provided by google (but with the strange digit(s) removed)
    if 'source' in args:
        param_dict['source'] = '&source=' + args.get('source')
        if sub_lang:
            param_dict['lr'] = '&lr=' + ''.join(ch for ch in sub_lang if not ch.isdigit())
        else:
            param_dict['lr'] = ''
    else:
        param_dict['lr'] = '&lr=' + config.lang

    param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
    # 'hl' takes the bare language code, without the 'lang_' prefix used by 'lr'
    param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '')
    param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')

    # Empty values contribute nothing, so a plain join matches skipping them
    return query + ''.join(param_dict.values())
|
2020-04-24 05:59:43 +03:00
|
|
|
|
|
|
|
|
|
|
|
class Request:
    """Sends GET requests to Google using a randomized User-Agent.

    Attributes:
        language: Language value given at construction (default 'lang_en').
        mobile: True when the caller's User-Agent contains 'Android' or 'iPhone'.
        modified_user_agent: User-Agent string produced by gen_user_agent and
            sent with every request instead of the caller's own.
    """

    def __init__(self, normal_ua, language='lang_en'):
        """Store the language and derive the spoofed User-Agent from ``normal_ua``."""
        self.language = language
        self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua
        self.modified_user_agent = gen_user_agent(self.mobile)

    def __getitem__(self, name):
        """Allow ``request['attr']`` as an alias for ``request.attr``."""
        return getattr(self, name)

    def autocomplete(self, query):
        """Return the list of autocomplete suggestions for ``query``.

        Returns an empty list when the response body is empty or cannot be
        parsed as XML.
        """
        # 'hl' takes the bare language code; strip the 'lang_' prefix the same
        # way gen_query does when it builds its own 'hl' parameter.
        ac_query = dict(hl=(self.language or '').replace('lang_', ''), q=query)
        response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)).text

        if not response:
            return []

        try:
            dom = etree.fromstring(response)
        except etree.XMLSyntaxError:
            # Suggestions are best-effort: a body that is not well-formed XML
            # (presumably an error page) yields no suggestions rather than a crash.
            return []
        return dom.xpath('//suggestion/@data')

    def send(self, base_url=SEARCH_URL, query='') -> Response:
        """GET ``base_url + query`` with the spoofed User-Agent and return the response."""
        headers = {'User-Agent': self.modified_user_agent}
        return requests.get(base_url + query, headers=headers)
|