Skip to content

Commit d80e9de

Browse files
authored
Merge pull request #10 from opsdisk/improve-search-results
Tweaked logic to improve number of search results returned
2 parents 876cedd + 5a3fbf8 commit d80e9de

File tree

2 files changed

+10
-14
lines changed

2 files changed

+10
-14
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="yagooglesearch",
8-
version="1.4.0",
8+
version="1.5.0",
99
author="Brennon Thomas",
1010
author_email="info@opsdisk.com",
1111
description="A Python library for executing intelligent, realistic-looking, and tunable Google searches.",

yagooglesearch/__init__.py

Lines changed: 9 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
# Custom Python libraries.
1414

15-
__version__ = "1.4.0"
15+
__version__ = "1.5.0"
1616

1717
# Logging
1818
ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -468,9 +468,7 @@ def search(self):
468468
gbar.clear()
469469
anchors = soup.find_all("a")
470470

471-
# Used to determine if another page of search results needs to be requested. If 100 search results are
472-
# requested per page, but the current page of results is less than that, no need to search the next page for
473-
# results because there won't be any. Prevents fruitless queries and costing a pointless search request.
471+
# Tracks number of valid URLs found on a search page.
474472
valid_links_found_in_this_search = 0
475473

476474
# Process every anchored URL.
@@ -498,21 +496,19 @@ def search(self):
498496
ROOT_LOGGER.info(f"Found unique URL #{total_valid_links_found}: {link}")
499497
unique_urls_set.add(link)
500498

499+
else:
500+
ROOT_LOGGER.info(f"Duplicate URL found: {link}")
501+
501502
# If we reached the limit of requested URLS, return with the results.
502503
if self.max_search_result_urls_to_return <= len(unique_urls_set):
503504
# Convert to a list.
504505
self.unique_urls_list = list(unique_urls_set)
505506
return self.unique_urls_list
506507

507-
# See comment for the "valid_links_found_in_this_search" variable. This is because determining if a "Next"
508-
# URL page of results exists is not straightforward. For example, this can happen if
509-
# max_search_result_urls_to_return=100, but there are only 93 total possible results.
510-
if valid_links_found_in_this_search != self.num:
511-
ROOT_LOGGER.info(
512-
f"The number of valid search results ({valid_links_found_in_this_search}) was not the requested "
513-
f"max results to pull back at once num=({self.num}) for this page. That implies there won't be "
514-
"any search results on the next page either. Moving on..."
515-
)
508+
# Determining if a "Next" URL page of results exists is not straightforward. If no valid links are found, the
509+
# search results have been exhausted.
510+
if valid_links_found_in_this_search == 0:
511+
ROOT_LOGGER.info("No valid search results found on this page. Moving on...")
516512
# Convert to a list.
517513
self.unique_urls_list = list(unique_urls_set)
518514
return self.unique_urls_list

0 commit comments

Comments (0)