12
12
13
13
# Custom Python libraries.
14
14
15
- __version__ = "1.5 .0"
15
+ __version__ = "1.6 .0"
16
16
17
17
# Logging
18
18
ROOT_LOGGER = logging .getLogger ("yagooglesearch" )
@@ -85,6 +85,7 @@ def __init__(
85
85
proxy = "" ,
86
86
verify_ssl = True ,
87
87
verbosity = 5 ,
88
+ verbose_output = False ,
88
89
):
89
90
90
91
"""
@@ -116,9 +117,10 @@ def __init__(
116
117
:param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
117
118
This may need to be disabled in some HTTPS proxy instances.
118
119
:param int verbosity: Logging and console output verbosity.
120
+ :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
119
121
120
122
:rtype: List of str, or list of dict when verbose_output is True
121
- :return: List of found URLs.
123
+ :return: List of URLs found, or (when verbose_output is True) list of dicts with the keys "rank", "title", "description", and "url".
122
124
"""
123
125
124
126
self .query = urllib .parse .quote_plus (query )
@@ -139,6 +141,7 @@ def __init__(
139
141
self .proxy = proxy
140
142
self .verify_ssl = verify_ssl
141
143
self .verbosity = verbosity
144
+ self .verbose_output = verbose_output
142
145
143
146
# Assign log level.
144
147
ROOT_LOGGER .setLevel ((6 - self .verbosity ) * 10 )
@@ -394,11 +397,11 @@ def search(self):
394
397
"""Start the Google search.
395
398
396
399
:rtype: List of str, or list of dict when verbose_output is True
397
- :return: List of URLs found
400
+ :return: List of URLs found, or (when verbose_output is True) list of dicts with the keys "rank", "title", "description", and "url".
398
401
"""
399
402
400
- # Set of URLs for the results found .
401
- unique_urls_set = set ()
403
+ # Consolidate search results.
404
+ self . search_result_list = []
402
405
403
406
# Count the number of valid, non-duplicate links found.
404
407
total_valid_links_found = 0
@@ -450,9 +453,8 @@ def search(self):
450
453
# HTTP 429 message returned from get_page() function, add "HTTP_429_DETECTED" to the results and return to the
451
454
# calling script.
452
455
if html == "HTTP_429_DETECTED" :
453
- unique_urls_set .add ("HTTP_429_DETECTED" )
454
- self .unique_urls_list = list (unique_urls_set )
455
- return self .unique_urls_list
456
+ self .search_result_list .append ("HTTP_429_DETECTED" )
457
+ return self .search_result_list
456
458
457
459
# Create the BeautifulSoup object.
458
460
soup = BeautifulSoup (html , "html.parser" )
@@ -486,32 +488,60 @@ def search(self):
486
488
if not link :
487
489
continue
488
490
491
+ if self .verbose_output :
492
+
493
+ # Extract the URL title.
494
+ try :
495
+ title = a .get_text ()
496
+ except Exception :
497
+ ROOT_LOGGER .warning (f"No title for link: { link } " )
498
+ title = ""
499
+
500
+ # Extract the URL description.
501
+ try :
502
+ description = a .parent .parent .contents [1 ].get_text ()
503
+
504
+ # Sometimes Google returns different structures.
505
+ if description == "" :
506
+ description = a .parent .parent .contents [2 ].get_text ()
507
+
508
+ except Exception :
509
+ ROOT_LOGGER .warning (f"No description for link: { link } " )
510
+ description = ""
511
+
489
512
# Check if URL has already been found.
490
- if link not in unique_urls_set :
513
+ if link not in self . search_result_list :
491
514
492
515
# Increase the counters.
493
516
valid_links_found_in_this_search += 1
494
517
total_valid_links_found += 1
495
518
496
519
ROOT_LOGGER .info (f"Found unique URL #{ total_valid_links_found } : { link } " )
497
- unique_urls_set .add (link )
520
+
521
+ if self .verbose_output :
522
+ self .search_result_list .append (
523
+ {
524
+ "rank" : total_valid_links_found , # Approximate rank according to yagooglesearch.
525
+ "title" : title .strip (), # Remove leading and trailing spaces.
526
+ "description" : description .strip (), # Remove leading and trailing spaces.
527
+ "url" : link ,
528
+ }
529
+ )
530
+ else :
531
+ self .search_result_list .append (link )
498
532
499
533
else :
500
534
ROOT_LOGGER .info (f"Duplicate URL found: { link } " )
501
535
502
536
# If we reached the limit of requested URLS, return with the results.
503
- if self .max_search_result_urls_to_return <= len (unique_urls_set ):
504
- # Convert to a list.
505
- self .unique_urls_list = list (unique_urls_set )
506
- return self .unique_urls_list
537
+ if self .max_search_result_urls_to_return <= len (self .search_result_list ):
538
+ return self .search_result_list
507
539
508
540
# Determining whether a "Next" page of results exists is not straightforward. If no valid links are found, the
509
541
# search results have been exhausted.
510
542
if valid_links_found_in_this_search == 0 :
511
543
ROOT_LOGGER .info ("No valid search results found on this page. Moving on..." )
512
- # Convert to a list.
513
- self .unique_urls_list = list (unique_urls_set )
514
- return self .unique_urls_list
544
+ return self .search_result_list
515
545
516
546
# Bump the starting page URL parameter for the next request.
517
547
self .start += self .num
0 commit comments