
 # Custom Python libraries.

-__version__ = "1.5.0"
+__version__ = "1.6.0"

 # Logging
 ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -85,7 +85,7 @@ def __init__(
         proxy="",
         verify_ssl=True,
         verbosity=5,
-        output="normal",
+        verbose_output=False,
     ):

         """
@@ -117,10 +117,10 @@ def __init__(
         :param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
             This may need to be disabled in some HTTPS proxy instances.
         :param int verbosity: Logging and console output verbosity.
-        :param str output: "normal" (Only URLs) or "complete" (Title, Description and urls)
+        :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.

         :rtype: List of str
-        :return: List of found URLs.
+        :return: List of URLs found or list of {"rank", "title", "description", "url"}
         """

         self.query = urllib.parse.quote_plus(query)
@@ -141,7 +141,7 @@ def __init__(
         self.proxy = proxy
         self.verify_ssl = verify_ssl
         self.verbosity = verbosity
-        self.output = output
+        self.verbose_output = verbose_output

         # Assign log level.
         ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
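
With this change, callers opt into richer results at construction time instead of passing a string mode. A minimal usage sketch: it assumes the library's existing `yagooglesearch.SearchClient` entry point and `max_search_result_urls_to_return` parameter (both already in the codebase); only `verbose_output` is new in this diff.

```python
import yagooglesearch

# Illustrative only: the query string and result limit are arbitrary choices.
client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",
    max_search_result_urls_to_return=10,  # Existing parameter referenced later in this diff.
    verbose_output=True,  # New flag: return rank/title/description/url dicts instead of bare URLs.
)
```
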
@@ -397,12 +397,11 @@ def search(self):
         """Start the Google search.

         :rtype: List of str
-        :return: List of URLs found or List of {"title", "desc", "url"}
+        :return: List of URLs found or list of {"rank", "title", "description", "url"}
         """

-        # Set of URLs for the results found.
-        unique_urls_set = set()
-        unique_complete_result = []
+        # Consolidate search results.
+        self.search_result_list = []

         # Count the number of valid, non-duplicate links found.
         total_valid_links_found = 0
@@ -454,9 +453,8 @@ def search(self):
             # HTTP 429 message returned from get_page() function, add "HTTP_429_DETECTED" to the set and return to the
             # calling script.
             if html == "HTTP_429_DETECTED":
-                unique_urls_set.add("HTTP_429_DETECTED")
-                self.unique_urls_list = list(unique_urls_set)
-                return self.unique_urls_list
+                self.search_result_list.append("HTTP_429_DETECTED")
+                return self.search_result_list

             # Create the BeautifulSoup object.
             soup = BeautifulSoup(html, "html.parser")
@@ -485,68 +483,65 @@ def search(self):
                     ROOT_LOGGER.warning(f"No href for link: {link}")
                     continue

-                if (self.output == "complete"):
-                    # Get the first SPAN from the anchor tag.
+                # Filter invalid links and links pointing to Google itself.
+                link = self.filter_search_result_urls(link)
+                if not link:
+                    continue
+
+                if self.verbose_output:
+
+                    # Extract the URL title.
                     try:
                         title = a.get_text()
                     except Exception:
-                        ROOT_LOGGER.warning(f"No title and desc for link")
-                        title = ''
-                        continue
+                        ROOT_LOGGER.warning(f"No title for link: {link}")
+                        title = ""

+                    # Extract the URL description.
                     try:
-                        desc = a.parent.parent.contents[1].get_text()
-                        # Sometimes google returns different structures
-                        if (desc == ''):
-                            desc = a.parent.parent.contents[2].get_text()
-                    except Exception:
-                        ROOT_LOGGER.warning(f"No title and desc for link")
-                        desc = ''
-                        continue
+                        description = a.parent.parent.contents[1].get_text()

-                # Filter invalid links and links pointing to Google itself.
-                link = self.filter_search_result_urls(link)
-                if not link:
-                    continue
+                        # Sometimes Google returns different structures.
+                        if description == "":
+                            description = a.parent.parent.contents[2].get_text()
+
+                    except Exception:
+                        ROOT_LOGGER.warning(f"No description for link: {link}")
+                        description = ""

                 # Check if URL has already been found.
-                if link not in unique_urls_set:
+                if link not in self.search_result_list:

                     # Increase the counters.
                     valid_links_found_in_this_search += 1
                     total_valid_links_found += 1

                     ROOT_LOGGER.info(f"Found unique URL #{total_valid_links_found}: {link}")
-                    unique_urls_set.add(link)

-                    if (self.output == "complete"):
-                        unique_complete_result.append({"title": title,
-                                                       "desc": desc,
-                                                       "url": link})
+                    if self.verbose_output:
+                        self.search_result_list.append(
+                            {
+                                "rank": total_valid_links_found,  # Approximate rank according to yagooglesearch.
+                                "title": title.strip(),  # Remove leading and trailing spaces.
+                                "description": description.strip(),  # Remove leading and trailing spaces.
+                                "url": link,
+                            }
+                        )
+                    else:
+                        self.search_result_list.append(link)

                 else:
                     ROOT_LOGGER.info(f"Duplicate URL found: {link}")

                 # If we reached the limit of requested URLS, return with the results.
-                if self.max_search_result_urls_to_return <= len(unique_urls_set):
-                    if (self.output == "complete"):
-                        return unique_complete_result
-                    else:
-                        # Convert to a list.
-                        self.unique_urls_list = list(unique_urls_set)
-                        return self.unique_urls_list
+                if self.max_search_result_urls_to_return <= len(self.search_result_list):
+                    return self.search_result_list

             # Determining if a "Next" URL page of results is not straightforward. If no valid links are found, the
             # search results have been exhausted.
             if valid_links_found_in_this_search == 0:
                 ROOT_LOGGER.info("No valid search results found on this page. Moving on...")
-                # Convert to a list.
-                if (self.output == "complete"):
-                    return unique_complete_result
-                else:
-                    # Convert to a list.
-                    self.unique_urls_list = list(unique_urls_set)
-                    return self.unique_urls_list
+                return self.search_result_list

             # Bump the starting page URL parameter for the next request.
             self.start += self.num
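
For context on the new return shape, a sketch of how a caller might consume `search()` after this change; the `client` object from the earlier example and the printing logic are illustrative assumptions, while the dict keys and the `"HTTP_429_DETECTED"` sentinel come straight from the diff above.

```python
results = client.search()

# get_page() signals rate limiting with "HTTP_429_DETECTED", which search()
# now appends to the result list before returning early.
if "HTTP_429_DETECTED" in results:
    print("Google HTTP 429 detected; back off before searching again.")

for result in results:
    if isinstance(result, dict):
        # verbose_output=True: {"rank", "title", "description", "url"} dictionaries.
        print(f"{result['rank']}. {result['title']} -> {result['url']}")
    elif result != "HTTP_429_DETECTED":
        # verbose_output=False: plain URL strings.
        print(result)
```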