
Commit ebf8492

Author: Nikolai Tschacher
Commit message: added detection check
Parent: 8300ba8

File tree

6 files changed (+126, -6 lines)

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 # Google Scraper specific
 Drivers/chromedriver
 Drivers/geckodriver
+Private/

 # For NodeJS

GoogleScraper/commandline.py

Lines changed: 4 additions & 0 deletions
@@ -84,6 +84,10 @@ def get_command_line(only_print_help=False):
                              'because GoogleScrape comes shipped with a thoroughly commented configuration file named '
                              '"scrape_config.py"')

+    parser.add_argument('--check-detection', type=str, dest='check_detection', action='store',
+                        help='Check if the given search engine blocked you from scraping. Detection can often be '
+                             'recognized by having to solve a captcha.')
+
     parser.add_argument('--simulate', action='store_true', default=False, required=False,
                         help='''If this flag is set, the scrape job and its estimated length will be printed.''')
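
For reference, the new flag simply stores the search engine name as a string. A minimal, self-contained sketch of how this single option parses in isolation (only this argument is registered; the rest of the parser setup in commandline.py is omitted):

import argparse

# Register only the new option; dest and action mirror the hunk above.
parser = argparse.ArgumentParser()
parser.add_argument('--check-detection', type=str, dest='check_detection', action='store',
                    help='Check if the given search engine blocked you from scraping.')

args = parser.parse_args(['--check-detection', 'google'])
print(args.check_detection)  # -> 'google'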

GoogleScraper/core.py

Lines changed: 9 additions & 1 deletion
@@ -155,7 +155,8 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
         parse_cmd_line: Whether to get options from the command line or not.
         config_from_dict: Configuration that is passed when GoogleScraper is called as a library.
     Returns:
-        A database session to the results when return_results is True. Else, nothing.
+        A database session to the results when return_results is True.
+        A status code may also be returned.
     """
     external_config_file_path = cmd_line_args = None
@@ -190,6 +191,13 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
             pass
         return

+    search_engine_name = config.get('check_detection', None)
+    if search_engine_name:
+        from GoogleScraper.selenium_mode import check_detection
+        code, status = check_detection(config, search_engine_name)
+        logger.info(status)
+        return code
+
     init_outfile(config, force_reload=True)

     kwfile = config.get('keyword_file', '')
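
Together with the new command-line flag, the intended entry point is presumably something like `GoogleScraper --check-detection google`. As a hedged sketch of the equivalent library call (assuming main() accepts a plain config dict with at least the keys read above; everything besides 'check_detection' and 'chromedriver_path' would normally come from scrape_config.py or the command line):

from GoogleScraper.core import main

# 'check_detection' triggers the new branch above; the remaining config keys
# are assumptions for illustration only.
code = main(parse_cmd_line=False, config_from_dict={
    'check_detection': 'google',
    'chromedriver_path': '/usr/bin/chromedriver',
})
print(code)  # 'DETECTED' or 'UNDETECTED', see check_detection() below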

GoogleScraper/scrape_config.py

Lines changed: 5 additions & 0 deletions
@@ -348,6 +348,11 @@
 # will set environment variable $DISPLAY to it
 xvfb_display = None

+
+# how many tabs per instance
+num_tabs = 1
+
+
 """
 [HTTP]
 All settings that target the raw http packet scraping mode.

GoogleScraper/selenium_mode.py

Lines changed: 98 additions & 1 deletion
@@ -35,6 +35,65 @@
 logger = logging.getLogger(__name__)


+class NotSupportedException(Exception):
+    pass
+
+
+def check_detection(config, search_engine_name):
+    """
+    Checks whether the search engine specified by search_engine_name
+    blocked us.
+    """
+    status = ''
+    chromedriver = config.get('chromedriver_path', '/usr/bin/chromedriver')
+
+    options = webdriver.ChromeOptions()
+    options.add_argument('headless')
+    options.add_argument('window-size=1200x600')
+
+    browser = webdriver.Chrome(chrome_options=options, executable_path=chromedriver)
+
+    if search_engine_name == 'google':
+        url = get_base_search_url_by_search_engine(config, 'google', 'selenium')
+        browser.get(url)
+
+        def check(browser, status):
+            needles = SearchEngineScrape.malicious_request_needles['google']
+
+            if needles['inurl'] in browser.current_url and needles['inhtml'] in browser.page_source:
+                status += 'Google is asking for a captcha! '
+                code = 'DETECTED'
+            else:
+                status += 'No captcha prompt detected. '
+                code = 'UNDETECTED'
+
+            return (code, status)
+
+        search_input = None
+        try:
+            search_input = WebDriverWait(browser, 5).until(
+                EC.visibility_of_element_located((By.NAME, 'q')))
+            status += 'Got a search input field. '
+        except TimeoutException:
+            status += 'No search input field located after 5 seconds. '
+            return check(browser, status)
+
+        try:
+            # random query
+            search_input.send_keys('President of Finland' + Keys.ENTER)
+            status += 'Google search successful! '
+        except WebDriverException:
+            status += 'Cannot make a Google search! '
+            return check(browser, status)
+
+        return check(browser, status)
+
+    else:
+        raise NotImplementedError('Detection check is only implemented for Google right now.')
+
+    return status
+
+
 def get_selenium_scraper_by_search_engine_name(config, search_engine_name, *args, **kwargs):
     """Get the appropriate selenium scraper for the given search engine name.
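
The inner check() helper relies on SearchEngineScrape.malicious_request_needles from GoogleScraper/scraping.py. The structure it expects is roughly the following; the concrete values are an assumption for illustration and are not part of this commit:

# Illustrative only: the authoritative values live on
# SearchEngineScrape.malicious_request_needles in GoogleScraper/scraping.py.
malicious_request_needles = {
    'google': {
        'inurl': '/sorry/',                    # fragment of Google's block/captcha URL
        'inhtml': 'detected unusual traffic',  # phrase shown on the captcha page
    },
}

def looks_blocked(current_url, page_source, needles=malicious_request_needles['google']):
    """Mirrors the check() logic above: both needles must match to count as detected."""
    return needles['inurl'] in current_url and needles['inhtml'] in page_source
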
@@ -142,6 +201,9 @@ def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
         self.captcha_lock = captcha_lock
         self.scrape_method = 'selenium'

+        # number of tabs per instance
+        self.number_of_tabs = self.config.get('num_tabs', 1)
+
         self.xvfb_display = self.config.get('xvfb_display', None)

         self.search_param_values = self._get_search_param_values()
@@ -150,6 +212,40 @@ def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
         self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)
         super().instance_creation_info(self.__class__.__name__)

+
+    def switch_to_tab(self, tab_number):
+        """Switch to the tab identified by tab_number.
+
+        https://stackoverflow.com/questions/46425797/opening-link-in-the-new-tab-and-switching-between-tabs-selenium-webdriver-pyt
+        https://gist.github.com/lrhache/7686903
+        """
+        assert tab_number < self.number_of_tabs
+
+        first_link = first_result.find_element_by_tag_name('a')
+
+        # Save the window opener (the current window; not to be confused with a tab)
+        main_window = browser.current_window_handle
+
+        # Open the link in a new tab by sending key strokes to the element.
+        # Use Keys.CONTROL + Keys.SHIFT + Keys.RETURN to open the tab on top of the stack.
+        first_link.send_keys(Keys.CONTROL + Keys.RETURN)
+
+        # Switch to the new tab, which we assume is the next one on the right
+        browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)
+
+        # Put focus on the current window, which puts focus on the currently visible tab
+        browser.switch_to_window(main_window)
+
+        # do whatever you have to do on this page; we just sleep for now
+        sleep(2)
+
+        # Close the current tab
+        browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'w')
+
+        # Put focus back on the window opener
+        browser.switch_to_window(main_window)
+
+
     def set_proxy(self):
         """Install a proxy on the communication channel."""
@@ -323,7 +419,8 @@ def handle_request_denied(self):

         super().handle_request_denied('400')

-        if self.config.get('manual_captcha_solving', False):
+        # only solve captchas manually when not running in headless mode
+        if self.config.get('manual_captcha_solving', False) and self.config.get('browser_mode') != 'headless':
             with self.captcha_lock:
                 solution = input('Please solve the captcha in the browser! Enter any key when done...')
                 try:
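
Note that switch_to_tab() as committed still references names (first_result, browser) that are not defined inside the method, so it reads as a skeleton lifted from the linked Stack Overflow answer. A self-contained sketch of the same tab-switching idea using Selenium's window-handle API follows; it assumes a local chromedriver and is not part of this commit:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('headless')
browser = webdriver.Chrome(chrome_options=options)

browser.get('about:blank')

# Open a second tab via JavaScript instead of keyboard shortcuts,
# which Chrome typically ignores when sent through WebDriver.
browser.execute_script("window.open('about:blank', '_blank');")

# window_handles lists every tab of this browser instance.
tabs = browser.window_handles
browser.switch_to.window(tabs[1])   # work in the new tab ...
browser.switch_to.window(tabs[0])   # ... then go back to the first one

browser.quit()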

TODO.md

Lines changed: 9 additions & 4 deletions
@@ -278,10 +278,15 @@ https://duo.com/decipher/driving-headless-chrome-with-python
 ### 08.09.2018

 + Parse Google Maps recommendations [DONE]
-+ Minimize resource consumption in selenium scraping with chrome
++ Minimize resource consumption in selenium scraping with chrome [DONE]
     - https://stackoverflow.com/questions/49008008/chrome-headless-puppeteer-too-much-cpu
     - https://news.ycombinator.com/item?id=14103503
     - https://news.ycombinator.com/item?id=14103503
-    - Added
-```
-```
+    - It is better to use tabs than new instances: https://github.com/GoogleChrome/puppeteer/issues/1569
+
++ Write code that manages X tabs within Y browser instances to yield X*Y scraper instances
++ https://blog.phantombuster.com/web-scraping-in-2017-headless-chrome-tips-tricks-4d6521d695e8?gi=7edcb5e70c66
++ Very good article about web scraping in general: https://blog.phantombuster.com/web-scraping-in-2017-headless-chrome-tips-tricks-4d6521d695e8
+
+
++ Add detection check
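
The "X tabs within Y browser instances" item above could look roughly like the following sketch, under the assumption that one Selenium driver per browser instance is acceptable; none of this is in the commit:

from selenium import webdriver

NUM_INSTANCES = 2  # Y browser instances
NUM_TABS = 3       # X tabs per instance

drivers = []
for _ in range(NUM_INSTANCES):
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(chrome_options=options)
    # each driver starts with one tab; open X - 1 more
    for _ in range(NUM_TABS - 1):
        driver.execute_script("window.open('about:blank', '_blank');")
    drivers.append(driver)

# every (driver, window handle) pair is one scraper slot: X * Y in total
slots = [(d, handle) for d in drivers for handle in d.window_handles]
assert len(slots) == NUM_INSTANCES * NUM_TABS

for driver, handle in slots:
    driver.switch_to.window(handle)
    # ... issue one search per slot here ...

for d in drivers:
    d.quit()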
