Commit c4cac70

Minor fixes, formatting, and documentation updates
1 parent 4be1c20 commit c4cac70

File tree

2 files changed: 20 additions, 15 deletions


README.md

Lines changed: 6 additions & 0 deletions
@@ -242,6 +242,11 @@ for search_query in search_queries:
     proxy_rotation_index += 1
 ```
 
+## GOOGLE_ABUSE_EXEMPTION cookie
+
+If you have a `GOOGLE_ABUSE_EXEMPTION` cookie value, it can be passed into `google_exemption` when instantiating the
+`SearchClient` object.
+
 ## &tbs= URL filter clarification
 
 The `&tbs=` parameter is used to specify either verbatim or time-based filters.
@@ -291,3 +296,4 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/ops
 ## Contributors
 
 * [KennBro](https://github.com/KennBro) - <https://github.com/opsdisk/yagooglesearch/pull/9>
+* [ArshansGithub](https://github.com/ArshansGithub) - <https://github.com/opsdisk/yagooglesearch/pull/21>
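
For reference, a minimal usage sketch of the README addition above, based on the documented `SearchClient` parameters; the exemption string below is only a placeholder, not a real cookie value:

```python
import yagooglesearch

# Placeholder -- substitute the GOOGLE_ABUSE_EXEMPTION cookie value taken from your own browser session.
google_exemption_value = "ID=example:TM=0000000000:C=r:IP=198.51.100.7-:S=placeholder"

client = yagooglesearch.SearchClient(
    "site:github.com",
    max_search_result_urls_to_return=10,
    google_exemption=google_exemption_value,
)
urls = client.search()

for url in urls:
    print(url)
```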

yagooglesearch/__init__.py

Lines changed: 14 additions & 15 deletions
@@ -12,7 +12,7 @@
 
 # Custom Python libraries.
 
-__version__ = "1.6.1"
+__version__ = "1.7.0"
 
 # Logging
 ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -86,9 +86,8 @@ def __init__(
         verify_ssl=True,
         verbosity=5,
         verbose_output=False,
-        google_exemption=None
+        google_exemption=None,
     ):
-
         """
         SearchClient
         :param str query: Query string. Must NOT be url-encoded.
@@ -119,8 +118,9 @@ def __init__(
             This may need to be disabled in some HTTPS proxy instances.
         :param int verbosity: Logging and console output verbosity.
         :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
-        :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain google searches. Defaults to None
-
+        :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain
+            google searches. Defaults to None.
+
         :rtype: List of str
         :return: List of URLs found or list of {"rank", "title", "description", "url"}
         """
@@ -154,9 +154,10 @@ def __init__(
             ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.")
             self.num = 100
 
-        # Initialize cookies to None, will be updated with each request in get_page().
+        # Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None.
+        # It will be updated with each request in get_page().
         if self.google_exemption:
-            self.cookies = {'GOOGLE_ABUSE_EXEMPTION': self.google_exemption}
+            self.cookies = {"GOOGLE_ABUSE_EXEMPTION": self.google_exemption}
         else:
             self.cookies = None
 
@@ -184,7 +185,6 @@ def __init__(
 
         # Update proxy_dict if a proxy is provided.
         if proxy:
-
             # Standardize case since the scheme will be checked against a hard-coded list.
             self.proxy = proxy.lower()
 
@@ -327,7 +327,12 @@ def get_page(self, url):
 
         ROOT_LOGGER.info(f"Requesting URL: {url}")
         response = requests.get(
-            url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl
+            url,
+            proxies=self.proxy_dict,
+            headers=headers,
+            cookies=self.cookies,
+            timeout=15,
+            verify=self.verify_ssl,
         )
 
         # Update the cookies.
@@ -347,7 +352,6 @@ def get_page(self, url):
         # See https://github.com/benbusby/whoogle-search/issues/311
         try:
             if response.cookies["CONSENT"].startswith("PENDING+"):
-
                 ROOT_LOGGER.warning(
                     "Looks like your IP address is sourcing from a European Union location...your search results may "
                     "vary, but I'll try and work around this by updating the cookie."
@@ -387,7 +391,6 @@ def get_page(self, url):
             html = response.text
 
         elif http_response_code == 429:
-
             ROOT_LOGGER.warning("Google is blocking your IP for making too many requests in a specific time period.")
 
             # Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a
@@ -437,7 +440,6 @@ def search(self):
         # Loop until we reach the maximum result results found or there are no more search results found to reach
         # max_search_result_urls_to_return.
         while total_valid_links_found <= self.max_search_result_urls_to_return:
-
             ROOT_LOGGER.info(
                 f"Stats: start={self.start}, num={self.num}, total_valid_links_found={total_valid_links_found} / "
                 f"max_search_result_urls_to_return={self.max_search_result_urls_to_return}"
@@ -490,7 +492,6 @@ def search(self):
 
             # Process every anchored URL.
            for a in anchors:
-
                # Get the URL from the anchor tag.
                try:
                    link = a["href"]
@@ -504,7 +505,6 @@ def search(self):
                    continue
 
                if self.verbose_output:
-
                    # Extract the URL title.
                    try:
                        title = a.get_text()
@@ -526,7 +526,6 @@ def search(self):
 
                # Check if URL has already been found.
                if link not in self.search_result_list:
-
                    # Increase the counters.
                    valid_links_found_in_this_search += 1
                    total_valid_links_found += 1
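
As a sanity check on the cookie handling change above, here is a minimal sketch of how a provided `GOOGLE_ABUSE_EXEMPTION` value ends up on the outgoing request when calling the `requests` library directly; the variable names and the example value are illustrative, not the library's internals:

```python
import requests

# Illustrative placeholder, not a real exemption value.
google_exemption = "ID=example:TM=0000000000:C=r:IP=198.51.100.7-:S=placeholder"

# Mirrors the new initialization logic: populate cookies if a value is provided, otherwise leave them unset.
cookies = {"GOOGLE_ABUSE_EXEMPTION": google_exemption} if google_exemption else None

# The cookies dict then rides along with the search request.
response = requests.get(
    "https://www.google.com/search?q=yagooglesearch",
    cookies=cookies,
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=15,
)
print(response.status_code)
```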
