File tree Expand file tree Collapse file tree 2 files changed +41
-5
lines changed Expand file tree Collapse file tree 2 files changed +41
-5
lines changed Original file line number Diff line number Diff line change @@ -97,7 +97,40 @@ using it.
97
97
98
98
If you do not want ` yagooglesearch ` to handle the HTTP 429 cool-off period and would rather handle it yourself, pass
99
99
` yagooglesearch_manages_http_429s=False ` when instantiating the yagooglesearch object. If an HTTP 429 is detected, the
100
- string "HTTP_429_DETECTED" will be returned and it's up to you on what the next step should be.
100
+ string "HTTP_429_DETECTED" is added to a list object that will be returned, and it's up to you to decide what the next step
101
+ should be. The list object will contain all the URLs found before the HTTP 429 was detected.
102
+
103
+ ``` python
104
+ import yagooglesearch
105
+
106
+ query = " site:twitter.com"
107
+
108
+ client = yagooglesearch.SearchClient(
109
+ query,
110
+ tbs = " li:1" ,
111
+ verbosity = 4 ,
112
+ num = 10 ,
113
+ max_search_result_urls_to_return = 1000 ,
114
+ minimum_delay_between_paged_results_in_seconds = 1 ,
115
+ yagooglesearch_manages_http_429s = False , # Add to manage HTTP 429s.
116
+ )
117
+ client.assign_random_user_agent()
118
+
119
+ urls = client.search()
120
+
121
+ if " HTTP_429_DETECTED" in urls:
122
+ print (" HTTP 429 detected...it's up to you to modify your search." )
123
+
124
+ # Remove HTTP_429_DETECTED from list.
125
+ urls.remove(" HTTP_429_DETECTED" )
126
+
127
+ print (" URLs found before HTTP 429 detected..." )
128
+
129
+ for url in urls:
130
+ print (url)
131
+ ```
132
+
133
+ ![ http429_detection_string_in_returned_list.png] ( img/http429_detection_string_in_returned_list.png )
101
134
102
135
## HTTP and SOCKS5 proxy support
103
136
Original file line number Diff line number Diff line change @@ -376,7 +376,7 @@ def get_page(self, url):
376
376
# notification string.
377
377
if not self .yagooglesearch_manages_http_429s :
378
378
ROOT_LOGGER .info ("Since yagooglesearch_manages_http_429s=False, yagooglesearch is done." )
379
- return "HTTP_429_detected "
379
+ return "HTTP_429_DETECTED "
380
380
381
381
ROOT_LOGGER .info (f"Sleeping for { self .http_429_cool_off_time_in_minutes } minutes..." )
382
382
time .sleep (self .http_429_cool_off_time_in_minutes * 60 )
@@ -447,9 +447,12 @@ def search(self):
447
447
# Request Google search results.
448
448
html = self .get_page (url )
449
449
450
- # HTTP 429 message returned from get_page() function, return to calling script.
451
- if html == "HTTP_429_detected" :
452
- return "HTTP_429_detected"
450
+ # HTTP 429 message returned from get_page() function, add "HTTP_429_DETECTED" to the set and return to the
451
+ # calling script.
452
+ if html == "HTTP_429_DETECTED" :
453
+ unique_urls_set .add ("HTTP_429_DETECTED" )
454
+ self .unique_urls_list = list (unique_urls_set )
455
+ return self .unique_urls_list
453
456
454
457
# Create the BeautifulSoup object.
455
458
soup = BeautifulSoup (html , "html.parser" )
You can’t perform that action at this time.
0 commit comments