Skip to content

Commit 40ab7f1

Browse files
author
Nikolai Tschacher
committed
tests
1 parent 1ed2011 commit 40ab7f1

File tree

11 files changed

+4469
-716
lines changed

11 files changed

+4469
-716
lines changed

GoogleScraper/parsing.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -477,19 +477,19 @@ class YandexParser(Parser):
477477
# @TODO: In December 2015, I saw that Yandex only shows the number of search results in the search input field
478478
# with javascript. One can scrape it in plain http mode, but the values are hidden in some javascript and not
479479
# accessible with normal xpath/css selectors. A normal text search is done.
480-
num_results_search_selectors = ['.serp-adv .serp-item__wrap > strong', '.input__found_visibility_visible font font::text']
480+
num_results_search_selectors = ['.serp-list .serp-adv__found::text', '.input__found_visibility_visible font font::text']
481481

482482
page_number_selectors = ['.pager__group .button_checked_yes span::text']
483483

484484
normal_search_selectors = {
485485
'results': {
486486
'de_ip': {
487-
'container': 'div.serp-list',
488-
'result_container': 'div.serp-item',
489-
'link': 'a.serp-item__title-link::attr(href)',
490-
'snippet': 'div.serp-item__text::text',
491-
'title': 'a.serp-item__title-link::text',
492-
'visible_link': 'a.serp-url__link::attr(href)'
487+
'container': '.serp-list',
488+
'result_container': '.serp-item',
489+
'link': 'a.link::attr(href)',
490+
'snippet': 'div.text-container::text',
491+
'title': 'div.organic__url-text::text',
492+
'visible_link': '.typo_type_greenurl::text'
493493
}
494494
}
495495
}

GoogleScraper/scrape_config.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,17 +99,29 @@
9999
# The google base search url
100100
google_search_url = 'https://www.google.com/search?'
101101

102+
# whether to change the search settings prior to scraping
103+
# when this is set to False google will search with
104+
# the default search settings that your browser supports
105+
google_selenium_search_settings = False
106+
107+
# the following options only take effect when
108+
# google_selenium_search_settings is set to True
109+
102110
# Search Settings for Google Scraping in Selenium Mode
103111
# 10, 20, 30, 50, 100
104-
google_selenium_num_results = 100
112+
google_selenium_num_results = 100
105113
# Private results help find more relevant content for you, including content and connections that only you can see.
106114
google_selenium_personalization = False
107115
# use a country code such as US, DE, GB, CH, ...
108-
google_selenium_region = 'DE'
116+
google_selenium_region = 'DE'
109117
google_selenium_safe_search = False
110118
# the language for google search results
111119
google_selenium_language = 'English'
112120

121+
# manually select search settings
122+
# only possible in visible browsers
123+
google_selenium_manual_settings = False
124+
113125

114126
# The yandex base search url
115127
yandex_search_url = 'http://yandex.ru/yandsearch?'

GoogleScraper/selenium_mode.py

Lines changed: 53 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
5959

6060
next_page_selectors = {
6161
'google': '#pnnext',
62-
'yandex': '.pager__button_kind_next',
62+
'yandex': '.pager__item_kind_next',
6363
'bing': '.sb_pagN',
6464
'yahoo': '#pg-next',
6565
'baidu': '.n',
@@ -474,7 +474,7 @@ def wait_until_serp_loaded(self):
474474
if self.search_engine_name == 'google':
475475
selector = '#navcnt td.cur'
476476
elif self.search_engine_name == 'yandex':
477-
selector = '.pager__item_current_yes font font'
477+
selector = '.pager__item_current_yes'
478478
elif self.search_engine_name == 'bing':
479479
selector = 'nav li a.sb_pagS'
480480
elif self.search_engine_name == 'yahoo':
@@ -497,7 +497,7 @@ def wait_until_serp_loaded(self):
497497
except TimeoutException as e:
498498
self._save_debug_screenshot()
499499
content = self.webdriver.find_element_by_css_selector(selector).text
500-
raise Exception('Pagenumber={} did not appear in navigation. Got "{}" instead'.format(self.page_number, content))
500+
logger.error('Pagenumber={} did not appear in navigation. Got "{}" instead'.format(self.page_number, content))
501501

502502
elif self.search_type == 'image':
503503
self.wait_until_title_contains_keyword()
@@ -700,63 +700,68 @@ def build_search(self):
700700
This is highly sensitive.
701701
"""
702702
super().build_search()
703-
# assume we are on the normal google search page right now
704-
self.webdriver.get('https://www.google.com/preferences?hl=en')
705703

706-
time.sleep(1)
704+
if self.config.get('google_selenium_search_settings', False):
705+
# assume we are on the normal google search page right now
706+
self.webdriver.get('https://www.google.com/preferences?hl=en')
707707

708-
# wait until we see the settings
709-
element = WebDriverWait(self.webdriver, 7).until(EC.presence_of_element_located((By.NAME, 'safeui')))
708+
time.sleep(1)
710709

711-
try:
712-
if self.config.get('google_selenium_safe_search', False):
713-
if self.webdriver.find_element_by_name('safeui').get_attribute('value') != 'on':
714-
self.webdriver.find_element_by_name('safeui').click()
710+
if self.config.get('google_selenium_manual_settings', False):
711+
return input('Press any Key after search settings completed...')
712+
713+
# wait until we see the settings
714+
element = WebDriverWait(self.webdriver, 7).until(EC.presence_of_element_located((By.NAME, 'safeui')))
715715

716716
try:
717-
if self.config.get('google_selenium_personalization', False):
718-
self.webdriver.find_element_by_css_selector('#pson-radio > div:first-child').click()
719-
else:
720-
self.webdriver.find_element_by_css_selector('#pson-radio > div:nth-child(2)').click()
721-
except WebDriverException as e:
722-
logger.warning('Cannot set personalization settings.')
717+
if self.config.get('google_selenium_safe_search', False):
718+
if self.webdriver.find_element_by_name('safeui').get_attribute('value') != 'on':
719+
self.webdriver.find_element_by_name('safeui').click()
723720

724-
time.sleep(1)
721+
try:
722+
if self.config.get('google_selenium_personalization', False):
723+
self.webdriver.find_element_by_css_selector('#pson-radio > div:first-child').click()
724+
else:
725+
self.webdriver.find_element_by_css_selector('#pson-radio > div:nth-child(2)').click()
726+
except WebDriverException as e:
727+
logger.warning('Cannot set personalization settings.')
725728

726-
# set the region
727-
try:
728-
self.webdriver.find_element_by_id('regionanchormore').click()
729-
except WebDriverException as e:
730-
logger.warning('Regions probably already expanded.')
731-
732-
region = self.config.get('google_selenium_region', 'US')
733-
self.webdriver.find_element_by_css_selector('div[data-value="{}"]'.format(region)).click()
734-
735-
# set the number of results
736-
num_results = self.config.get('google_selenium_num_results', 10)
737-
self.webdriver.find_element_by_id('result_slider').click()
738-
# reset
739-
for i in range(5):
740-
self.webdriver.find_element_by_id('result_slider').send_keys(Keys.LEFT)
741-
# move to desired result
742-
for i in range((num_results//10)-1):
743-
time.sleep(.25)
744-
self.webdriver.find_element_by_id('result_slider').send_keys(Keys.RIGHT)
729+
time.sleep(1)
745730

746-
time.sleep(1)
731+
# set the region
732+
try:
733+
self.webdriver.find_element_by_id('regionanchormore').click()
734+
except WebDriverException as e:
735+
logger.warning('Regions probably already expanded.')
747736

748-
# save settings
749-
self.webdriver.find_element_by_css_selector('#form-buttons div:first-child').click()
750-
# accept alert
751-
self.webdriver.switch_to.alert.accept()
737+
region = self.config.get('google_selenium_region', 'US')
738+
self.webdriver.find_element_by_css_selector('div[data-value="{}"]'.format(region)).click()
752739

753-
time.sleep(2)
740+
# set the number of results
741+
num_results = self.config.get('google_selenium_num_results', 10)
742+
self.webdriver.find_element_by_id('result_slider').click()
743+
# reset
744+
for i in range(5):
745+
self.webdriver.find_element_by_id('result_slider').send_keys(Keys.LEFT)
746+
# move to desired result
747+
for i in range((num_results//10)-1):
748+
time.sleep(.25)
749+
self.webdriver.find_element_by_id('result_slider').send_keys(Keys.RIGHT)
754750

755-
self.handle_request_denied()
751+
time.sleep(1)
756752

757-
except WebDriverException as e:
758-
logger.error(e)
759-
raise e
753+
# save settings
754+
self.webdriver.find_element_by_css_selector('#form-buttons div:first-child').click()
755+
# accept alert
756+
self.webdriver.switch_to.alert.accept()
757+
758+
time.sleep(2)
759+
760+
self.handle_request_denied()
761+
762+
except WebDriverException as e:
763+
logger.error(e)
764+
raise e
760765

761766

762767
class DuckduckgoSelScrape(SelScrape):

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22

33
## New News - August 2018
44

5+
For questions you can [contact me on my webpage](https://incolumitas.com/) and write me an email there.
6+
57
This project is back to life after two years of abandonment. In the coming weeks, I will take some time to update all functionality to the most recent developments. This encompasses updating all Regexes and changes in search engine behavior. After a couple of weeks, you can expect this project to work again as documented here.
68

79
#### Goals
810

911
+ Remove PhantomJS support **[Done]**
1012
+ Maybe remove Firefox support **[Done: Decision was to keep Firefox]**
1113
+ Think about using https://github.com/GoogleChrome/puppeteer instead of selenium for Chrome.
12-
+ Test all supported search engines
14+
+ Test all supported search engines **[Partly Done]**
1315

1416
### Table of Contents
1517

0 commit comments

Comments
 (0)