12
12
13
13
# Custom Python libraries.
14
14
15
- __version__ = "1.2 .0"
15
+ __version__ = "1.4 .0"
16
16
17
17
# Logging
18
18
ROOT_LOGGER = logging .getLogger ("yagooglesearch" )
@@ -77,8 +77,9 @@ def __init__(
77
77
country = "" ,
78
78
extra_params = None ,
79
79
max_search_result_urls_to_return = 100 ,
80
- delay_between_paged_results_in_seconds = list ( range ( 7 , 18 )) ,
80
+ minimum_delay_between_paged_results_in_seconds = 7 ,
81
81
user_agent = None ,
82
+ yagooglesearch_manages_http_429s = True ,
82
83
http_429_cool_off_time_in_minutes = 60 ,
83
84
http_429_cool_off_factor = 1.1 ,
84
85
proxy = "" ,
@@ -88,28 +89,31 @@ def __init__(
88
89
89
90
"""
90
91
SearchClient
91
- :param str query: Query string. Must NOT be url-encoded.
92
+ :param str query: Query string. Must NOT be url-encoded.
92
93
:param str tld: Top level domain.
93
94
:param str lang: Language.
94
95
:param str tbs: Verbatim search or time limits (e.g., "qdr:h" => last hour, "qdr:d" => last 24 hours, "qdr:m"
95
96
=> last month).
96
97
:param str safe: Safe search.
97
98
:param int start: First page of results to retrieve.
98
99
:param int num: Max number of results to pull back per page. Capped at 100 by Google.
99
- :param str country: Country or region to focus the search on. Similar to changing the TLD, but does not yield
100
+ :param str country: Country or region to focus the search on. Similar to changing the TLD, but does not yield
100
101
exactly the same results. Only Google knows why...
101
- :param dict extra_params: A dictionary of extra HTTP GET parameters, which must be URL encoded. For example if
102
+ :param dict extra_params: A dictionary of extra HTTP GET parameters, which must be URL encoded. For example if
102
103
you don't want Google to filter similar results you can set the extra_params to {'filter': '0'} which will
103
104
append '&filter=0' to every query.
104
105
:param int max_search_result_urls_to_return: Max URLs to return for the entire Google search.
105
- :param int delay_between_paged_results_in_seconds: Time to wait between HTTP requests for consecutive pages for
106
- the same search query.
106
+ :param int minimum_delay_between_paged_results_in_seconds: Minimum time to wait between HTTP requests for
107
+ consecutive pages for the same search query. The actual time will be a random value between this minimum
108
+ value and this minimum value + 10 (inclusive) to make it look more human.
107
109
:param str user_agent: Hard-coded user agent for the HTTP requests.
110
+ :param bool yagooglesearch_manages_http_429s: Determines if yagooglesearch will handle HTTP 429 cool off and
111
+ retries. Set to False if you want to manage HTTP 429 responses yourself.
108
112
:param int http_429_cool_off_time_in_minutes: Minutes to sleep if an HTTP 429 is detected.
109
113
:param float http_429_cool_off_factor: Factor to multiply by http_429_cool_off_time_in_minutes for each HTTP 429
110
114
detected.
111
115
:param str proxy: HTTP(S) or SOCKS5 proxy to use.
112
- :param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
116
+ :param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
113
117
This may need to be disabled in some HTTPS proxy instances.
114
118
:param int verbosity: Logging and console output verbosity.
115
119
@@ -127,8 +131,9 @@ def __init__(
127
131
self .country = country
128
132
self .extra_params = extra_params
129
133
self .max_search_result_urls_to_return = max_search_result_urls_to_return
130
- self .delay_between_paged_results_in_seconds = delay_between_paged_results_in_seconds
134
+ self .minimum_delay_between_paged_results_in_seconds = minimum_delay_between_paged_results_in_seconds
131
135
self .user_agent = user_agent
136
+ self .yagooglesearch_manages_http_429s = yagooglesearch_manages_http_429s
132
137
self .http_429_cool_off_time_in_minutes = http_429_cool_off_time_in_minutes
133
138
self .http_429_cool_off_factor = http_429_cool_off_factor
134
139
self .proxy = proxy
@@ -362,14 +367,24 @@ def get_page(self, url):
362
367
363
368
if http_response_code == 200 :
364
369
html = response .text
370
+
365
371
elif http_response_code == 429 :
372
+
366
373
ROOT_LOGGER .warning ("Google is blocking your IP for making too many requests in a specific time period." )
374
+
375
+ # Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a
376
+ # notification string.
377
+ if not self .yagooglesearch_manages_http_429s :
378
+ ROOT_LOGGER .info ("Since yagooglesearch_manages_http_429s=False, yagooglesearch is done." )
379
+ return "HTTP_429_DETECTED"
380
+
367
381
ROOT_LOGGER .info (f"Sleeping for { self .http_429_cool_off_time_in_minutes } minutes..." )
368
382
time .sleep (self .http_429_cool_off_time_in_minutes * 60 )
369
383
self .http_429_detected ()
370
384
371
385
# Try making the request again.
372
386
html = self .get_page (url )
387
+
373
388
else :
374
389
ROOT_LOGGER .warning (f"HTML response code: { http_response_code } " )
375
390
@@ -432,6 +447,13 @@ def search(self):
432
447
# Request Google search results.
433
448
html = self .get_page (url )
434
449
450
+ # If get_page() returned the "HTTP_429_DETECTED" notification string, add it to the set and return the
451
+ # resulting list to the calling script.
452
+ if html == "HTTP_429_DETECTED" :
453
+ unique_urls_set .add ("HTTP_429_DETECTED" )
454
+ self .unique_urls_list = list (unique_urls_set )
455
+ return self .unique_urls_list
456
+
435
457
# Create the BeautifulSoup object.
436
458
soup = BeautifulSoup (html , "html.parser" )
437
459
@@ -509,6 +531,11 @@ def search(self):
509
531
url = self .url_next_page_num
510
532
511
533
# Randomize sleep time between paged requests to make it look more human.
512
- random_sleep_time = random .choice (self .delay_between_paged_results_in_seconds )
534
+ random_sleep_time = random .choice (
535
+ range (
536
+ self .minimum_delay_between_paged_results_in_seconds ,
537
+ self .minimum_delay_between_paged_results_in_seconds + 11 ,
538
+ )
539
+ )
513
540
ROOT_LOGGER .info (f"Sleeping { random_sleep_time } seconds until retrieving the next page of results..." )
514
541
time .sleep (random_sleep_time )
0 commit comments