
Commit 113a088

Merge pull request #4 from opsdisk/issue-3-make-ssl-verify-optional

Added verify_ssl option

2 parents: eb23182 + 24a6666

File tree: README.md, setup.py, yagooglesearch/__init__.py

3 files changed: +62 -11 lines changed

README.md

Lines changed: 41 additions & 2 deletions
```diff
@@ -13,6 +13,7 @@ heavily based off the [googlesearch](https://github.com/MarioVilas/googlesearch)
 * Randomizing delay times between retrieving paged search results (i.e., clicking on page 2 for more results)
 * HTTP(S) and SOCKS5 proxy support
 * Leveraging `requests` library for HTTP requests and cookie management
+* Adds "&filter=0" by default to search URLs to prevent any omission or filtering of search results by Google
 * Console and file logging
 * Python 3.6+
 
```
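For context, `filter=0` is the Google query-string switch that turns off duplicate/omitted-result filtering. A minimal illustrative sketch of such a URL (the library builds its search URLs internally; the `q` and `num` values here are made up):

```python
import urllib.parse

# Illustrative only: yagooglesearch constructs its own search URLs internally.
params = {"q": "site:github.com", "num": 100, "filter": 0}
url = "https://www.google.com/search?" + urllib.parse.urlencode(params)
print(url)  # https://www.google.com/search?q=site%3Agithub.com&num=100&filter=0
```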

```diff
@@ -61,7 +62,7 @@ client = yagooglesearch.SearchClient(
     http_429_cool_off_time_in_minutes=45,
     http_429_cool_off_factor=1.5,
     proxy="socks5h://127.0.0.1:9050",
-    verbosity=5
+    verbosity=5,
 )
 client.assign_random_user_agent()
 
```

````diff
@@ -123,6 +124,40 @@ Supported proxy schemes are based off those supported in the Python `requests` l
 * `socks5h` - "If you want to resolve the domains on the proxy server, use socks5h as the scheme." This is the
   **best** option if you are using SOCKS because the DNS lookup and Google search is sourced from the proxy IP address.
 
+## HTTPS proxies and SSL/TLS certificates
+
+If you are using a self-signed certificate for an HTTPS proxy, you will likely need to disable SSL/TLS verification when
+either:
+
+1) Instantiating the `yagooglesearch.SearchClient` object:
+
+```python
+import yagooglesearch
+
+query = "site:github.com"
+
+client = yagooglesearch.SearchClient(
+    query,
+    proxy="http://127.0.0.1:8080",
+    verify_ssl=False,
+    verbosity=5,
+)
+```
+
+2) or after instantiation:
+
+```python
+query = "site:github.com"
+
+client = yagooglesearch.SearchClient(
+    query,
+    proxy="http://127.0.0.1:8080",
+    verbosity=5,
+)
+
+client.verify_ssl = False
+```
+
 ## Multiple proxies
 
 If you want to use multiple proxies, that burden is on the script utilizing the `yagooglesearch` library to instantiate
````
```diff
@@ -135,7 +170,7 @@ import yagooglesearch
 proxies = [
     "socks5h://127.0.0.1:9050",
     "socks5h://127.0.0.1:9051",
-    "socks5h://127.0.0.1:9052",
+    "http://127.0.0.1:9052",  # HTTPS proxy with a self-signed SSL/TLS certificate.
 ]
 
 search_queries = [
```
```diff
@@ -158,6 +193,10 @@ for search_query in search_queries:
         proxy=proxies[proxy_index],
     )
 
+    # Only disable SSL/TLS verification for the HTTPS proxy using a self-signed certificate.
+    if proxies[proxy_index].startswith("http://"):
+        client.verify_ssl = False
+
     urls_list = client.search()
 
     print(urls_list)
```
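The hunk above shows only the loop body; the `proxy_index` bookkeeping sits outside the diff context. A minimal sketch of the round-robin rotation it implies (the queries and the modulo update are assumptions, not the README's exact code):

```python
import yagooglesearch

proxies = [
    "socks5h://127.0.0.1:9050",
    "socks5h://127.0.0.1:9051",
    "http://127.0.0.1:9052",  # HTTPS proxy with a self-signed SSL/TLS certificate.
]

search_queries = ["site:github.com", "site:opsdisk.com"]  # Illustrative queries.

proxy_index = 0

for search_query in search_queries:
    client = yagooglesearch.SearchClient(
        search_query,
        proxy=proxies[proxy_index],
    )

    # Only disable SSL/TLS verification for the HTTPS proxy using a self-signed certificate.
    if proxies[proxy_index].startswith("http://"):
        client.verify_ssl = False

    urls_list = client.search()
    print(urls_list)

    # Assumed bookkeeping: rotate to the next proxy, wrapping at the end of the list.
    proxy_index = (proxy_index + 1) % len(proxies)
```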

setup.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="yagooglesearch",
-    version="1.1.0",
+    version="1.2.0",
     author="Brennon Thomas",
     author_email="info@opsdisk.com",
     description="A Python library for executing intelligent, realistic-looking, and tunable Google searches.",
```

yagooglesearch/__init__.py

Lines changed: 20 additions & 8 deletions
```diff
@@ -12,7 +12,7 @@
 
 # Custom Python libraries.
 
-__version__ = "1.1.0"
+__version__ = "1.2.0"
 
 # Logging
 ROOT_LOGGER = logging.getLogger("yagooglesearch")
```
```diff
@@ -82,6 +82,7 @@ def __init__(
         http_429_cool_off_time_in_minutes=60,
         http_429_cool_off_factor=1.1,
         proxy="",
+        verify_ssl=True,
         verbosity=5,
     ):
 
```
```diff
@@ -108,6 +109,8 @@ def __init__(
         :param float http_429_cool_off_factor: Factor to multiply by http_429_cool_off_time_in_minutes for each HTTP 429
             detected.
         :param str proxy: HTTP(S) or SOCKS5 proxy to use.
+        :param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
+            This may need to be disabled in some HTTPS proxy instances.
         :param int verbosity: Logging and console output verbosity.
 
         :rtype: List of str
```
```diff
@@ -129,6 +132,7 @@ def __init__(
         self.http_429_cool_off_time_in_minutes = http_429_cool_off_time_in_minutes
         self.http_429_cool_off_factor = http_429_cool_off_factor
         self.proxy = proxy
+        self.verify_ssl = verify_ssl
         self.verbosity = verbosity
 
         # Assign log level.
```
```diff
@@ -176,6 +180,10 @@ def __init__(
             "https": self.proxy,
         }
 
+        # Suppress warning messages if verify_ssl is disabled.
+        if not self.verify_ssl:
+            requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
+
     def update_urls(self):
         """Update search URLs being used."""
 
```
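For reference, this is the stock `urllib3` warning-suppression hook that `requests` re-exports; a standalone sketch of the same pattern (the endpoint is hypothetical):

```python
import requests

# Without this call, every requests.get(..., verify=False) request emits an InsecureRequestWarning.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

response = requests.get("https://self-signed.example.com/", verify=False)  # Hypothetical endpoint.
print(response.status_code)
```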

```diff
@@ -234,8 +242,9 @@ def filter_search_result_urls(self, link):
         ROOT_LOGGER.debug(f"pre filter_search_result_urls() link: {link}")
 
         try:
-            # Extract URL from parameter.
-            if link.startswith("/url?"):
+            # Extract URL from parameter. Once in a while the full "http://www.google.com/url?" exists instead of just
+            # "/url?". After a re-run, it disappears and "/url?" is present...might be a caching thing?
+            if link.startswith("/url?") or link.startswith("http://www.google.com/url?"):
                 urlparse_object = urllib.parse.urlparse(link, scheme="http")
 
                 # The "q" key exists most of the time.
```
```diff
@@ -294,7 +303,9 @@ def get_page(self, url):
         }
 
         ROOT_LOGGER.info(f"Requesting URL: {url}")
-        response = requests.get(url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15)
+        response = requests.get(
+            url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl
+        )
 
         # Update the cookies.
         self.cookies = response.cookies
```
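For the failure mode this `verify=self.verify_ssl` pass-through addresses: with verification left on, a self-signed proxy certificate makes `requests` raise `requests.exceptions.SSLError`. A hedged sketch of what a caller would see (the proxy address is an assumption):

```python
import requests

try:
    # With verify=True (the default), a self-signed proxy certificate fails the TLS handshake.
    requests.get(
        "https://www.google.com/search",
        proxies={"https": "http://127.0.0.1:8080"},  # Hypothetical intercepting proxy.
        timeout=15,
        verify=True,
    )
except requests.exceptions.SSLError as error:
    print(f"SSL verification failed: {error}")
```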
```diff
@@ -303,10 +314,11 @@ def get_page(self, url):
         http_response_code = response.status_code
 
         # debug_requests_response(response)
-        ROOT_LOGGER.debug(f"    status_code: {http_response_code}")
-        ROOT_LOGGER.debug(f"    proxy: {self.proxy}")
-        ROOT_LOGGER.debug(f"    headers: {headers}")
-        ROOT_LOGGER.debug(f"    cookies: {self.cookies}")
+        ROOT_LOGGER.debug(f"    status_code: {http_response_code}")
+        ROOT_LOGGER.debug(f"    headers: {headers}")
+        ROOT_LOGGER.debug(f"    cookies: {self.cookies}")
+        ROOT_LOGGER.debug(f"    proxy: {self.proxy}")
+        ROOT_LOGGER.debug(f"    verify_ssl: {self.verify_ssl}")
 
         html = ""
```