35
35
logger = logging .getLogger (__name__ )
36
36
37
37
38
class NotSupportedException(Exception):
    """Exception type for signalling that something is not supported.

    Carries no extra state or behavior beyond the built-in ``Exception``.
    """
40
+
41
+
42
def check_detection(config, search_engine_name):
    """
    Check whether the search engine named by search_engine_name blocked us.

    A headless Chrome instance is started, the engine's base search URL is
    opened and a test query is typed into the search box. Afterwards the
    current URL and page source are compared against the known
    "malicious request" needles of that engine.

    Args:
        config: Mapping-like configuration object (must offer ``.get``).
            The key ``'chromedriver_path'`` selects the chromedriver binary
            and defaults to ``'/usr/bin/chromedriver'``.
        search_engine_name: Name of the search engine to probe. Only
            ``'google'`` is implemented.

    Returns:
        A ``(code, status)`` tuple. ``code`` is ``'DETECTED'`` when the
        captcha needles match and ``'UNDETECTED'`` otherwise; ``status`` is
        a human-readable log of the steps that were performed.

    Raises:
        NotImplementedError: If search_engine_name is not ``'google'``.
    """
    # Reject unsupported engines before a browser is launched, so that no
    # Chrome process is started (and left behind) just to raise an error.
    if search_engine_name != 'google':
        raise NotImplementedError('Detection check only implemented for Google Right now.')

    chromedriver = config.get('chromedriver_path', '/usr/bin/chromedriver')

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1200x600')

    browser = webdriver.Chrome(chrome_options=options, executable_path=chromedriver)

    def check(status):
        """Append the captcha verdict to status and return (code, status)."""
        needles = SearchEngineScrape.malicious_request_needles['google']

        if needles['inurl'] in browser.current_url and needles['inhtml'] in browser.page_source:
            return ('DETECTED', status + 'Google is asking for a captcha! ')
        return ('UNDETECTED', status + 'No captcha prompt detected. ')

    try:
        url = get_base_search_url_by_search_engine(config, 'google', 'selenium')
        browser.get(url)

        try:
            search_input = WebDriverWait(browser, 5).until(
                EC.visibility_of_element_located((By.NAME, 'q')))
        except TimeoutException:
            # Without a search box no query can be issued; still report
            # whether the page we landed on is a captcha prompt.
            return check('No search input field located after 5 seconds. ')

        status = 'Got a search input field. '

        try:
            # random query
            search_input.send_keys('President of Finland' + Keys.ENTER)
            status += 'Google Search successful! '
        except WebDriverException:
            status += 'Cannot make a google search! '

        return check(status)
    finally:
        # The verdict above is computed before this runs; always shut the
        # browser down so repeated checks do not pile up Chrome processes.
        browser.quit()
95
+
96
+
38
97
def get_selenium_scraper_by_search_engine_name (config , search_engine_name , * args , ** kwargs ):
39
98
"""Get the appropriate selenium scraper for the given search engine name.
40
99
@@ -142,6 +201,9 @@ def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
142
201
self .captcha_lock = captcha_lock
143
202
self .scrape_method = 'selenium'
144
203
204
+ # number of tabs per instance
205
+ self .number_of_tabs = self .config .get ('num_tabs' , 1 )
206
+
145
207
self .xvfb_display = self .config .get ('xvfb_display' , None )
146
208
147
209
self .search_param_values = self ._get_search_param_values ()
@@ -150,6 +212,40 @@ def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
150
212
self .base_search_url = get_base_search_url_by_search_engine (self .config , self .search_engine_name , self .scrape_method )
151
213
super ().instance_creation_info (self .__class__ .__name__ )
152
214
215
+
216
def switch_to_tab(self, tab_number):
    """Switch the browser focus to the tab identified by tab_number.

    https://stackoverflow.com/questions/46425797/opening-link-in-the-new-tab-and-switching-between-tabs-selenium-webdriver-pyt
    https://gist.github.com/lrhache/7686903

    Args:
        tab_number: Zero-based index of the tab to activate. Must be
            smaller than ``self.number_of_tabs``.

    Raises:
        ValueError: If tab_number is outside ``[0, self.number_of_tabs)``
            or if the browser currently has fewer open tabs than
            tab_number requires.
    """
    # Explicit check instead of ``assert`` so that the validation is not
    # stripped when Python runs with -O.
    if not 0 <= tab_number < self.number_of_tabs:
        raise ValueError('tab_number {} is out of range, {} tab(s) configured'.format(
            tab_number, self.number_of_tabs))

    # NOTE(review): assumes the Selenium driver is stored on self.webdriver;
    # the attribute is not visible in this part of the file - confirm.
    driver = self.webdriver

    # NOTE(review): presumably window_handles is ordered by tab creation;
    # the WebDriver spec does not guarantee that order - verify for the
    # drivers in use.
    handles = driver.window_handles
    if tab_number >= len(handles):
        raise ValueError('tab_number {} requested, but only {} tab(s) are open'.format(
            tab_number, len(handles)))

    driver.switch_to.window(handles[tab_number])
247
+
248
+
153
249
def set_proxy (self ):
154
250
"""Install a proxy on the communication channel."""
155
251
@@ -323,7 +419,8 @@ def handle_request_denied(self):
323
419
324
420
super ().handle_request_denied ('400' )
325
421
326
- if self .config .get ('manual_captcha_solving' , False ):
422
+ # only ask for a manual captcha solution when the browser is not running in headless mode
423
+ if self .config .get ('manual_captcha_solving' , False ) and self .config .get ('browser_mode' ) != 'headless' :
327
424
with self .captcha_lock :
328
425
solution = input ('Please solve the captcha in the browser! Enter any key when done...' )
329
426
try :
0 commit comments