Skip to content

Commit ab59ec3

Browse files
authored
Merge pull request #9 from KennBro/add-output-param
Add output param
2 parents d80e9de + fde7bcb commit ab59ec3

File tree

4 files changed

+54
-19
lines changed

4 files changed

+54
-19
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,9 @@ client = yagooglesearch.SearchClient(
6161
max_search_result_urls_to_return=100,
6262
http_429_cool_off_time_in_minutes=45,
6363
http_429_cool_off_factor=1.5,
64-
proxy="socks5h://127.0.0.1:9050",
64+
# proxy="socks5h://127.0.0.1:9050",
6565
verbosity=5,
66+
verbose_output=True, # False (only URLs) or True (rank, title, description, and URL)
6667
)
6768
client.assign_random_user_agent()
6869

@@ -286,3 +287,7 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/ops
286287

287288
* [Mario Vilas](https://github.com/MarioVilas) for his amazing work on the original
288289
[googlesearch](https://github.com/MarioVilas/googlesearch) library.
290+
291+
## Contributors
292+
293+
* [KennBro](https://github.com/KennBro) - <https://github.com/opsdisk/yagooglesearch/pull/9>
Loading

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="yagooglesearch",
8-
version="1.5.0",
8+
version="1.6.0",
99
author="Brennon Thomas",
1010
author_email="info@opsdisk.com",
1111
description="A Python library for executing intelligent, realistic-looking, and tunable Google searches.",

yagooglesearch/__init__.py

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
# Custom Python libraries.
1414

15-
__version__ = "1.5.0"
15+
__version__ = "1.6.0"
1616

1717
# Logging
1818
ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -85,6 +85,7 @@ def __init__(
8585
proxy="",
8686
verify_ssl=True,
8787
verbosity=5,
88+
verbose_output=False,
8889
):
8990

9091
"""
@@ -116,9 +117,10 @@ def __init__(
116117
:param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
117118
This may need to be disabled in some HTTPS proxy instances.
118119
:param int verbosity: Logging and console output verbosity.
120+
:param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
119121
120122
:rtype: List of str
121-
:return: List of found URLs.
123+
:return: List of URLs found or list of {"rank", "title", "description", "url"}.
122124
"""
123125

124126
self.query = urllib.parse.quote_plus(query)
@@ -139,6 +141,7 @@ def __init__(
139141
self.proxy = proxy
140142
self.verify_ssl = verify_ssl
141143
self.verbosity = verbosity
144+
self.verbose_output = verbose_output
142145

143146
# Assign log level.
144147
ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
@@ -394,11 +397,11 @@ def search(self):
394397
"""Start the Google search.
395398
396399
:rtype: List of str
397-
:return: List of URLs found
400+
:return: List of URLs found or list of {"rank", "title", "description", "url"}.
398401
"""
399402

400-
# Set of URLs for the results found.
401-
unique_urls_set = set()
403+
# Consolidate search results.
404+
self.search_result_list = []
402405

403406
# Count the number of valid, non-duplicate links found.
404407
total_valid_links_found = 0
@@ -450,9 +453,8 @@ def search(self):
450453
# HTTP 429 message returned from get_page() function, add "HTTP_429_DETECTED" to the list and return to the
451454
# calling script.
452455
if html == "HTTP_429_DETECTED":
453-
unique_urls_set.add("HTTP_429_DETECTED")
454-
self.unique_urls_list = list(unique_urls_set)
455-
return self.unique_urls_list
456+
self.search_result_list.append("HTTP_429_DETECTED")
457+
return self.search_result_list
456458

457459
# Create the BeautifulSoup object.
458460
soup = BeautifulSoup(html, "html.parser")
@@ -486,32 +488,60 @@ def search(self):
486488
if not link:
487489
continue
488490

491+
if self.verbose_output:
492+
493+
# Extract the URL title.
494+
try:
495+
title = a.get_text()
496+
except Exception:
497+
ROOT_LOGGER.warning(f"No title for link: {link}")
498+
title = ""
499+
500+
# Extract the URL description.
501+
try:
502+
description = a.parent.parent.contents[1].get_text()
503+
504+
# Sometimes Google returns different structures.
505+
if description == "":
506+
description = a.parent.parent.contents[2].get_text()
507+
508+
except Exception:
509+
ROOT_LOGGER.warning(f"No description for link: {link}")
510+
description = ""
511+
489512
# Check if URL has already been found.
490-
if link not in unique_urls_set:
513+
if link not in self.search_result_list:
491514

492515
# Increase the counters.
493516
valid_links_found_in_this_search += 1
494517
total_valid_links_found += 1
495518

496519
ROOT_LOGGER.info(f"Found unique URL #{total_valid_links_found}: {link}")
497-
unique_urls_set.add(link)
520+
521+
if self.verbose_output:
522+
self.search_result_list.append(
523+
{
524+
"rank": total_valid_links_found, # Approximate rank according to yagooglesearch.
525+
"title": title.strip(), # Remove leading and trailing spaces.
526+
"description": description.strip(), # Remove leading and trailing spaces.
527+
"url": link,
528+
}
529+
)
530+
else:
531+
self.search_result_list.append(link)
498532

499533
else:
500534
ROOT_LOGGER.info(f"Duplicate URL found: {link}")
501535

502536
# If we reached the limit of requested URLs, return with the results.
503-
if self.max_search_result_urls_to_return <= len(unique_urls_set):
504-
# Convert to a list.
505-
self.unique_urls_list = list(unique_urls_set)
506-
return self.unique_urls_list
537+
if self.max_search_result_urls_to_return <= len(self.search_result_list):
538+
return self.search_result_list
507539

508540
# Determining if a "Next" URL page of results is not straightforward. If no valid links are found, the
509541
# search results have been exhausted.
510542
if valid_links_found_in_this_search == 0:
511543
ROOT_LOGGER.info("No valid search results found on this page. Moving on...")
512-
# Convert to a list.
513-
self.unique_urls_list = list(unique_urls_set)
514-
return self.unique_urls_list
544+
return self.search_result_list
515545

516546
# Bump the starting page URL parameter for the next request.
517547
self.start += self.num

0 commit comments

Comments
 (0)