12
12
13
13
# Custom Python libraries.
14
14
15
- __version__ = "1.6.1 "
15
+ __version__ = "1.7.0 "
16
16
17
17
# Logging
18
18
ROOT_LOGGER = logging .getLogger ("yagooglesearch" )
@@ -86,9 +86,8 @@ def __init__(
86
86
verify_ssl = True ,
87
87
verbosity = 5 ,
88
88
verbose_output = False ,
89
- google_exemption = None
89
+ google_exemption = None ,
90
90
):
91
-
92
91
"""
93
92
SearchClient
94
93
:param str query: Query string. Must NOT be url-encoded.
@@ -119,8 +118,9 @@ def __init__(
119
118
This may need to be disabled in some HTTPS proxy instances.
120
119
:param int verbosity: Logging and console output verbosity.
121
120
:param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
122
- :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain google searches. Defaults to None
123
-
121
+ :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain
122
+ Google searches. Defaults to None.
123
+
124
124
:rtype: List of str
125
125
:return: List of URLs found or list of {"rank", "title", "description", "url"}
126
126
"""
@@ -154,9 +154,10 @@ def __init__(
154
154
ROOT_LOGGER .warning ("The largest value allowed by Google for num is 100. Setting num to 100." )
155
155
self .num = 100
156
156
157
- # Initialize cookies to None, will be updated with each request in get_page().
157
+ # Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None.
158
+ # The cookies will be updated with each request in get_page().
158
159
if self .google_exemption :
159
- self .cookies = {' GOOGLE_ABUSE_EXEMPTION' : self .google_exemption }
160
+ self .cookies = {" GOOGLE_ABUSE_EXEMPTION" : self .google_exemption }
160
161
else :
161
162
self .cookies = None
162
163
@@ -184,7 +185,6 @@ def __init__(
184
185
185
186
# Update proxy_dict if a proxy is provided.
186
187
if proxy :
187
-
188
188
# Standardize case since the scheme will be checked against a hard-coded list.
189
189
self .proxy = proxy .lower ()
190
190
@@ -327,7 +327,12 @@ def get_page(self, url):
327
327
328
328
ROOT_LOGGER .info (f"Requesting URL: { url } " )
329
329
response = requests .get (
330
- url , proxies = self .proxy_dict , headers = headers , cookies = self .cookies , timeout = 15 , verify = self .verify_ssl
330
+ url ,
331
+ proxies = self .proxy_dict ,
332
+ headers = headers ,
333
+ cookies = self .cookies ,
334
+ timeout = 15 ,
335
+ verify = self .verify_ssl ,
331
336
)
332
337
333
338
# Update the cookies.
@@ -347,7 +352,6 @@ def get_page(self, url):
347
352
# See https://github.com/benbusby/whoogle-search/issues/311
348
353
try :
349
354
if response .cookies ["CONSENT" ].startswith ("PENDING+" ):
350
-
351
355
ROOT_LOGGER .warning (
352
356
"Looks like your IP address is sourcing from a European Union location...your search results may "
353
357
"vary, but I'll try and work around this by updating the cookie."
@@ -387,7 +391,6 @@ def get_page(self, url):
387
391
html = response .text
388
392
389
393
elif http_response_code == 429 :
390
-
391
394
ROOT_LOGGER .warning ("Google is blocking your IP for making too many requests in a specific time period." )
392
395
393
396
# Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a
@@ -437,7 +440,6 @@ def search(self):
437
440
# Loop until we reach the maximum number of results found or there are no more search results found to reach
438
441
# max_search_result_urls_to_return.
439
442
while total_valid_links_found <= self .max_search_result_urls_to_return :
440
-
441
443
ROOT_LOGGER .info (
442
444
f"Stats: start={ self .start } , num={ self .num } , total_valid_links_found={ total_valid_links_found } / "
443
445
f"max_search_result_urls_to_return={ self .max_search_result_urls_to_return } "
@@ -490,7 +492,6 @@ def search(self):
490
492
491
493
# Process every anchored URL.
492
494
for a in anchors :
493
-
494
495
# Get the URL from the anchor tag.
495
496
try :
496
497
link = a ["href" ]
@@ -504,7 +505,6 @@ def search(self):
504
505
continue
505
506
506
507
if self .verbose_output :
507
-
508
508
# Extract the URL title.
509
509
try :
510
510
title = a .get_text ()
@@ -526,7 +526,6 @@ def search(self):
526
526
527
527
# Check if URL has already been found.
528
528
if link not in self .search_result_list :
529
-
530
529
# Increase the counters.
531
530
valid_links_found_in_this_search += 1
532
531
total_valid_links_found += 1
0 commit comments