
Commit ebf8492

Author: Nikolai Tschacher
Commit message: added detection check
Parent: 8300ba8

File tree

6 files changed (+126, -6 lines)

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 # Google Scraper specific
 Drivers/chromedriver
 Drivers/geckodriver
+Private/

 # For NodeJS

GoogleScraper/commandline.py

Lines changed: 4 additions & 0 deletions
@@ -84,6 +84,10 @@ def get_command_line(only_print_help=False):
                              'because GoogleScrape comes shipped with a thoroughly commented configuration file named '
                              '"scrape_config.py"')

+    parser.add_argument('--check-detection', type=str, dest='check_detection', action='store',
+                        help='Check if the given search engine blocked you from scraping. Detection can often be '
+                             'recognized by having to solve a captcha.')
+
     parser.add_argument('--simulate', action='store_true', default=False, required=False,
                         help='''If this flag is set, the scrape job and its estimated length will be printed.''')
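
For reference, the new flag simply stores the search engine name as a string. A minimal, self-contained sketch of how this single option parses in isolation (only this argument is registered; the rest of the parser setup in commandline.py is omitted):

import argparse

# Register only the new option; dest and action mirror the hunk above.
parser = argparse.ArgumentParser()
parser.add_argument('--check-detection', type=str, dest='check_detection', action='store',
                    help='Check if the given search engine blocked you from scraping.')

args = parser.parse_args(['--check-detection', 'google'])
print(args.check_detection)  # -> 'google'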

GoogleScraper/core.py

Lines changed: 9 additions & 1 deletion
@@ -155,7 +155,8 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
         parse_cmd_line: Whether to get options from the command line or not.
         config_from_dict: Configuration that is passed when GoogleScraper is called as a library.
     Returns:
-        A database session to the results when return_results is True. Else, nothing.
+        A database session to the results when return_results is True.
+        A status code may also be returned.
     """
     external_config_file_path = cmd_line_args = None
@@ -190,6 +191,13 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
             pass
         return

+    search_engine_name = config.get('check_detection', None)
+    if search_engine_name:
+        from GoogleScraper.selenium_mode import check_detection
+        code, status = check_detection(config, search_engine_name)
+        logger.info(status)
+        return code
+
     init_outfile(config, force_reload=True)

     kwfile = config.get('keyword_file', '')
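
Together with the new command-line flag, the intended entry point is presumably something like `GoogleScraper --check-detection google`. As a hedged sketch of the equivalent library call (assuming main() accepts a plain config dict with at least the keys read above; everything besides 'check_detection' and 'chromedriver_path' would normally come from scrape_config.py or the command line):

from GoogleScraper.core import main

# 'check_detection' triggers the new branch above; the remaining config keys
# are assumptions for illustration only.
code = main(parse_cmd_line=False, config_from_dict={
    'check_detection': 'google',
    'chromedriver_path': '/usr/bin/chromedriver',
})
print(code)  # 'DETECTED' or 'UNDETECTED', see check_detection() below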

GoogleScraper/scrape_config.py

Lines changed: 5 additions & 0 deletions
@@ -348,6 +348,11 @@
 # will set environment variable $DISPLAY to it
 xvfb_display = None

+
+# how many tabs per instance
+num_tabs = 1
+
+
 """
 [HTTP]
 All settings that target the raw http packet scraping mode.

GoogleScraper/selenium_mode.py

Lines changed: 98 additions & 1 deletion
@@ -35,6 +35,65 @@
 logger = logging.getLogger(__name__)


+class NotSupportedException(Exception):
+    pass
+
+
+def check_detection(config, search_engine_name):
+    """
+    Checks whether the search engine specified by search_engine_name
+    blocked us.
+    """
+    status = ''
+    chromedriver = config.get('chromedriver_path', '/usr/bin/chromedriver')
+
+    options = webdriver.ChromeOptions()
+    options.add_argument('headless')
+    options.add_argument('window-size=1200x600')
+
+    browser = webdriver.Chrome(chrome_options=options, executable_path=chromedriver)
+
+    if search_engine_name == 'google':
+        url = get_base_search_url_by_search_engine(config, 'google', 'selenium')
+        browser.get(url)
+
+        def check(browser, status):
+            needles = SearchEngineScrape.malicious_request_needles['google']
+
+            if needles['inurl'] in browser.current_url and needles['inhtml'] in browser.page_source:
+                status += 'Google is asking for a captcha! '
+                code = 'DETECTED'
+            else:
+                status += 'No captcha prompt detected. '
+                code = 'UNDETECTED'
+
+            return (code, status)
+
+        search_input = None
+        try:
+            search_input = WebDriverWait(browser, 5).until(
+                EC.visibility_of_element_located((By.NAME, 'q')))
+            status += 'Got a search input field. '
+        except TimeoutException:
+            status += 'No search input field located after 5 seconds. '
+            return check(browser, status)
+
+        try:
+            # random query
+            search_input.send_keys('President of Finland' + Keys.ENTER)
+            status += 'Google search successful! '
+        except WebDriverException:
+            status += 'Cannot make a Google search! '
+            return check(browser, status)
+
+        return check(browser, status)
+
+    else:
+        raise NotImplementedError('Detection check is only implemented for Google right now.')
+
+    return status
+
+
 def get_selenium_scraper_by_search_engine_name(config, search_engine_name, *args, **kwargs):
     """Get the appropriate selenium scraper for the given search engine name.
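
The inner check() helper relies on SearchEngineScrape.malicious_request_needles from GoogleScraper/scraping.py. The structure it expects is roughly the following; the concrete values are an assumption for illustration and are not part of this commit:

# Illustrative only: the authoritative values live on
# SearchEngineScrape.malicious_request_needles in GoogleScraper/scraping.py.
malicious_request_needles = {
    'google': {
        'inurl': '/sorry/',                    # fragment of Google's block/captcha URL
        'inhtml': 'detected unusual traffic',  # phrase shown on the captcha page
    },
}

def looks_blocked(current_url, page_source, needles=malicious_request_needles['google']):
    """Mirrors the check() logic above: both needles must match to count as detected."""
    return needles['inurl'] in current_url and needles['inhtml'] in page_source
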
@@ -142,6 +201,9 @@ def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
         self.captcha_lock = captcha_lock
         self.scrape_method = 'selenium'

+        # number of tabs per instance
+        self.number_of_tabs = self.config.get('num_tabs', 1)
+
         self.xvfb_display = self.config.get('xvfb_display', None)

         self.search_param_values = self._get_search_param_values()
@@ -150,6 +212,40 @@ def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
         self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)
         super().instance_creation_info(self.__class__.__name__)

+
+    def switch_to_tab(self, tab_number):
+        """Switch to the tab identified by tab_number.
+
+        https://stackoverflow.com/questions/46425797/opening-link-in-the-new-tab-and-switching-between-tabs-selenium-webdriver-pyt
+        https://gist.github.com/lrhache/7686903
+        """
+        assert tab_number < self.number_of_tabs
+
+        first_link = first_result.find_element_by_tag_name('a')
+
+        # Save the window opener (the current window; not to be confused with a tab)
+        main_window = browser.current_window_handle
+
+        # Open the link in a new tab by sending key strokes to the element.
+        # Use Keys.CONTROL + Keys.SHIFT + Keys.RETURN to open the tab on top of the stack.
+        first_link.send_keys(Keys.CONTROL + Keys.RETURN)
+
+        # Switch to the new tab, which we assume is the next one on the right
+        browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)
+
+        # Put focus on the current window, which puts focus on the currently visible tab
+        browser.switch_to_window(main_window)
+
+        # do whatever you have to do on this page; we just sleep for now
+        sleep(2)
+
+        # Close the current tab
+        browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'w')
+
+        # Put focus back on the window opener
+        browser.switch_to_window(main_window)
+
+
     def set_proxy(self):
         """Install a proxy on the communication channel."""
@@ -323,7 +419,8 @@ def handle_request_denied(self):

         super().handle_request_denied('400')

-        if self.config.get('manual_captcha_solving', False):
+        # only solve captchas manually when not running in headless mode
+        if self.config.get('manual_captcha_solving', False) and self.config.get('browser_mode') != 'headless':
             with self.captcha_lock:
                 solution = input('Please solve the captcha in the browser! Enter any key when done...')
                 try:
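
Note that switch_to_tab() as committed still references names (first_result, browser) that are not defined inside the method, so it reads as a skeleton lifted from the linked Stack Overflow answer. A self-contained sketch of the same tab-switching idea using Selenium's window-handle API follows; it assumes a local chromedriver and is not part of this commit:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('headless')
browser = webdriver.Chrome(chrome_options=options)

browser.get('about:blank')

# Open a second tab via JavaScript instead of keyboard shortcuts,
# which Chrome typically ignores when sent through WebDriver.
browser.execute_script("window.open('about:blank', '_blank');")

# window_handles lists every tab of this browser instance.
tabs = browser.window_handles
browser.switch_to.window(tabs[1])   # work in the new tab ...
browser.switch_to.window(tabs[0])   # ... then go back to the first one

browser.quit()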

TODO.md

Lines changed: 9 additions & 4 deletions
@@ -278,10 +278,15 @@ https://duo.com/decipher/driving-headless-chrome-with-python
 ### 08.09.2018

 + Parse Google Maps recommendations [DONE]
-+ Minimize resource consumption in selenium scraping with chrome
++ Minimize resource consumption in selenium scraping with chrome [DONE]
     - https://stackoverflow.com/questions/49008008/chrome-headless-puppeteer-too-much-cpu
     - https://news.ycombinator.com/item?id=14103503
     - https://news.ycombinator.com/item?id=14103503
-    - Added
-```
-```
+    - It is better to use tabs than new instances: https://github.com/GoogleChrome/puppeteer/issues/1569
+
++ Write code that manages X tabs within Y browser instances to yield X*Y scraper instances
++ https://blog.phantombuster.com/web-scraping-in-2017-headless-chrome-tips-tricks-4d6521d695e8?gi=7edcb5e70c66
++ Very good article about web scraping in general: https://blog.phantombuster.com/web-scraping-in-2017-headless-chrome-tips-tricks-4d6521d695e8
+
+
++ Add detection check
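
The "X tabs within Y browser instances" item above could look roughly like the following sketch, under the assumption that one Selenium driver per browser instance is acceptable; none of this is in the commit:

from selenium import webdriver

NUM_INSTANCES = 2  # Y browser instances
NUM_TABS = 3       # X tabs per instance

drivers = []
for _ in range(NUM_INSTANCES):
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(chrome_options=options)
    # each driver starts with one tab; open X - 1 more
    for _ in range(NUM_TABS - 1):
        driver.execute_script("window.open('about:blank', '_blank');")
    drivers.append(driver)

# every (driver, window handle) pair is one scraper slot: X * Y in total
slots = [(d, handle) for d in drivers for handle in d.window_handles]
assert len(slots) == NUM_INSTANCES * NUM_TABS

for driver, handle in slots:
    driver.switch_to.window(handle)
    # ... issue one search per slot here ...

for d in drivers:
    d.quit()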
