Skip to content

Commit f2fb252

Browse files
committed
Logic, formatting, and variable cleanup
1 parent ecb461f commit f2fb252

File tree

1 file changed

+44
-49
lines changed

1 file changed

+44
-49
lines changed

yagooglesearch/__init__.py

Lines changed: 44 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
# Custom Python libraries.
1414

15-
__version__ = "1.5.0"
15+
__version__ = "1.6.0"
1616

1717
# Logging
1818
ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -85,7 +85,7 @@ def __init__(
8585
proxy="",
8686
verify_ssl=True,
8787
verbosity=5,
88-
output="normal",
88+
verbose_output=False,
8989
):
9090

9191
"""
@@ -117,10 +117,10 @@ def __init__(
117117
:param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
118118
This may need to be disabled in some HTTPS proxy instances.
119119
:param int verbosity: Logging and console output verbosity.
120-
:param str output: "normal" (Only URLs) or "complete" (Title, Description and urls)
120+
:param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
121121
122122
:rtype: List of str, or list of dict if verbose_output is True
123-
:return: List of found URLs.
123+
:return: List of URLs found or list of {"rank", "title", "description", "url"}
124124
"""
125125

126126
self.query = urllib.parse.quote_plus(query)
@@ -141,7 +141,7 @@ def __init__(
141141
self.proxy = proxy
142142
self.verify_ssl = verify_ssl
143143
self.verbosity = verbosity
144-
self.output = output
144+
self.verbose_output = verbose_output
145145

146146
# Assign log level.
147147
ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
@@ -397,12 +397,11 @@ def search(self):
397397
"""Start the Google search.
398398
399399
:rtype: List of str, or list of dict if verbose_output is True
400-
:return: List of URLs found or List of {"title", "desc", "url"}
400+
:return: List of URLs found or list of {"rank", "title", "description", "url"}
401401
"""
402402

403-
# Set of URLs for the results found.
404-
unique_urls_set = set()
405-
unique_complete_result = []
403+
# Consolidate search results.
404+
self.search_result_list = []
406405

407406
# Count the number of valid, non-duplicate links found.
408407
total_valid_links_found = 0
@@ -454,9 +453,8 @@ def search(self):
454453
# HTTP 429 message returned from get_page() function, add "HTTP_429_DETECTED" to the list and return to the
455454
# calling script.
456455
if html == "HTTP_429_DETECTED":
457-
unique_urls_set.add("HTTP_429_DETECTED")
458-
self.unique_urls_list = list(unique_urls_set)
459-
return self.unique_urls_list
456+
self.search_result_list.append("HTTP_429_DETECTED")
457+
return self.search_result_list
460458

461459
# Create the BeautifulSoup object.
462460
soup = BeautifulSoup(html, "html.parser")
@@ -485,68 +483,65 @@ def search(self):
485483
ROOT_LOGGER.warning(f"No href for link: {link}")
486484
continue
487485

488-
if (self.output == "complete"):
489-
# Get the first SPAN from the anchor tag.
486+
# Filter invalid links and links pointing to Google itself.
487+
link = self.filter_search_result_urls(link)
488+
if not link:
489+
continue
490+
491+
if self.verbose_output:
492+
493+
# Extract the URL title.
490494
try:
491495
title = a.get_text()
492496
except Exception:
493-
ROOT_LOGGER.warning(f"No title and desc for link")
494-
title = ''
495-
continue
497+
ROOT_LOGGER.warning(f"No title for link: {link}")
498+
title = ""
496499

500+
# Extract the URL description.
497501
try:
498-
desc = a.parent.parent.contents[1].get_text()
499-
# Sometimes google returns different structures
500-
if (desc == ''):
501-
desc = a.parent.parent.contents[2].get_text()
502-
except Exception:
503-
ROOT_LOGGER.warning(f"No title and desc for link")
504-
desc = ''
505-
continue
502+
description = a.parent.parent.contents[1].get_text()
506503

507-
# Filter invalid links and links pointing to Google itself.
508-
link = self.filter_search_result_urls(link)
509-
if not link:
510-
continue
504+
# Sometimes Google returns different structures.
505+
if description == "":
506+
description = a.parent.parent.contents[2].get_text()
507+
508+
except Exception:
509+
ROOT_LOGGER.warning(f"No description for link: {link}")
510+
description = ""
511511

512512
# Check if URL has already been found.
513-
if link not in unique_urls_set:
513+
if link not in self.search_result_list:
514514

515515
# Increase the counters.
516516
valid_links_found_in_this_search += 1
517517
total_valid_links_found += 1
518518

519519
ROOT_LOGGER.info(f"Found unique URL #{total_valid_links_found}: {link}")
520-
unique_urls_set.add(link)
521520

522-
if (self.output == "complete"):
523-
unique_complete_result.append({"title": title,
524-
"desc": desc,
525-
"url": link})
521+
if self.verbose_output:
522+
self.search_result_list.append(
523+
{
524+
"rank": total_valid_links_found, # Approximate rank according to yagooglesearch.
525+
"title": title.strip(), # Remove leading and trailing spaces.
526+
"description": description.strip(), # Remove leading and trailing spaces.
527+
"url": link,
528+
}
529+
)
530+
else:
531+
self.search_result_list.append(link)
526532

527533
else:
528534
ROOT_LOGGER.info(f"Duplicate URL found: {link}")
529535

530536
# If we reached the limit of requested URLs, return with the results.
531-
if self.max_search_result_urls_to_return <= len(unique_urls_set):
532-
if (self.output == "complete"):
533-
return unique_complete_result
534-
else:
535-
# Convert to a list.
536-
self.unique_urls_list = list(unique_urls_set)
537-
return self.unique_urls_list
537+
if self.max_search_result_urls_to_return <= len(self.search_result_list):
538+
return self.search_result_list
538539

539540
# Determining if a "Next" URL page of results exists is not straightforward. If no valid links are found, the
540541
# search results have been exhausted.
541542
if valid_links_found_in_this_search == 0:
542543
ROOT_LOGGER.info("No valid search results found on this page. Moving on...")
543-
# Convert to a list.
544-
if (self.output == "complete"):
545-
return unique_complete_result
546-
else:
547-
# Convert to a list.
548-
self.unique_urls_list = list(unique_urls_set)
549-
return self.unique_urls_list
544+
return self.search_result_list
550545

551546
# Bump the starting page URL parameter for the next request.
552547
self.start += self.num

0 commit comments

Comments
 (0)