Skip to content

Commit 36b8ffa

Browse files
committed
Added result language support and removed URL encoding of extra_params
1 parent 06a8a4b commit 36b8ffa

File tree

1 file changed

+33
-12
lines changed

1 file changed

+33
-12
lines changed

yagooglesearch/__init__.py

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
# Custom Python libraries.
1414

15-
__version__ = "1.7.0"
15+
__version__ = "1.8.0"
1616

1717
# Logging
1818
ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -38,13 +38,25 @@
3838

3939
try:
4040
user_agents_file = os.path.join(install_folder, "user_agents.txt")
41-
with open(user_agents_file) as fh:
41+
with open(user_agents_file, "r") as fh:
4242
user_agents_list = [_.strip() for _ in fh.readlines()]
4343

4444
except Exception:
4545
user_agents_list = [USER_AGENT]
4646

4747

48+
# Load the list of result languages. Compiled by viewing the source code at https://www.google.com/advanced_search for
49+
# the supported languages.
50+
try:
51+
result_languages_file = os.path.join(install_folder, "result_languages.txt")
52+
with open(result_languages_file, "r") as fh:
53+
result_languages_list = [_.strip().split("=")[0] for _ in fh.readlines()]
54+
55+
except Exception as e:
56+
print(f"There was an issue loading the result languages file. Exception: {e}")
57+
result_languages_list = []
58+
59+
4860
def get_tbs(from_date, to_date):
4961
"""Helper function to format the tbs parameter dates. Note that verbatim mode also uses the &tbs= parameter, but
5062
this function is just for customized search periods.
@@ -69,7 +81,8 @@ def __init__(
6981
self,
7082
query,
7183
tld="com",
72-
lang="en",
84+
lang_html_ui="en",
85+
lang_result="lang_en",
7386
tbs="0",
7487
safe="off",
7588
start=0,
@@ -92,7 +105,8 @@ def __init__(
92105
SearchClient
93106
:param str query: Query string. Must NOT be url-encoded.
94107
:param str tld: Top level domain.
95-
:param str lang: Language.
108+
:param str lang_html_ui: HTML User Interface language.
109+
:param str lang_result: Search result language.
96110
:param str tbs: Verbatim search or time limits (e.g., "qdr:h" => last hour, "qdr:d" => last 24 hours, "qdr:m"
97111
=> last month).
98112
:param str safe: Safe search.
@@ -127,7 +141,8 @@ def __init__(
127141

128142
self.query = urllib.parse.quote_plus(query)
129143
self.tld = tld
130-
self.lang = lang
144+
self.lang_html_ui = lang_html_ui
145+
self.lang_result = lang_result.lower()
131146
self.tbs = tbs
132147
self.safe = safe
133148
self.start = start
@@ -150,6 +165,13 @@ def __init__(
150165
ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
151166

152167
# Argument checks.
168+
if self.lang_result not in result_languages_list:
169+
ROOT_LOGGER.error(
170+
f"{self.lang_result} is not a valid language result. See {result_languages_file} for the list of valid "
171+
'languages. Setting lang_result to "lang_en".'
172+
)
173+
self.lang_result = "lang_en"
174+
153175
if self.num > 100:
154176
ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.")
155177
self.num = 100
@@ -171,6 +193,7 @@ def __init__(
171193
"safe",
172194
"start",
173195
"tbs",
196+
"lr",
174197
)
175198

176199
# Default user agent, unless instructed by the user to change it.
@@ -215,28 +238,28 @@ def update_urls(self):
215238

216239
# First search requesting the default 10 search results.
217240
self.url_search = (
218-
f"https://www.google.{self.tld}/search?hl={self.lang}&"
241+
f"https://www.google.{self.tld}/search?hl={self.lang_html_ui}&lr={self.lang_result}&"
219242
f"q={self.query}&btnG=Google+Search&tbs={self.tbs}&safe={self.safe}&"
220243
f"cr={self.country}&filter=0"
221244
)
222245

223246
# Subsequent searches starting at &start= and retrieving 10 search results at a time.
224247
self.url_next_page = (
225-
f"https://www.google.{self.tld}/search?hl={self.lang}&"
248+
f"https://www.google.{self.tld}/search?hl={self.lang_html_ui}&lr={self.lang_result}&"
226249
f"q={self.query}&start={self.start}&tbs={self.tbs}&safe={self.safe}&"
227250
f"cr={self.country}&filter=0"
228251
)
229252

230253
# First search requesting more than the default 10 search results.
231254
self.url_search_num = (
232-
f"https://www.google.{self.tld}/search?hl={self.lang}&"
255+
f"https://www.google.{self.tld}/search?hl={self.lang_html_ui}&lr={self.lang_result}&"
233256
f"q={self.query}&num={self.num}&btnG=Google+Search&tbs={self.tbs}&"
234257
f"safe={self.safe}&cr={self.country}&filter=0"
235258
)
236259

237260
# Subsequent searches starting at &start= and retrieving &num= search results at a time.
238261
self.url_next_page_num = (
239-
f"https://www.google.{self.tld}/search?hl={self.lang}&"
262+
f"https://www.google.{self.tld}/search?hl={self.lang_html_ui}&lr={self.lang_result}&"
240263
f"q={self.query}&start={self.start}&num={self.num}&tbs={self.tbs}&"
241264
f"safe={self.safe}&cr={self.country}&filter=0"
242265
)
@@ -458,10 +481,8 @@ def search(self):
458481
url = self.url_search_num
459482

460483
# Append extra GET parameters to the URL. This is done on every iteration because we're rebuilding the
461-
# entire URL at the end of this loop.
484+
# entire URL at the end of this loop. The keys and values are not URL encoded.
462485
for key, value in self.extra_params.items():
463-
key = urllib.parse.quote_plus(key)
464-
value = urllib.parse.quote_plus(value)
465486
url += f"&{key}={value}"
466487

467488
# Request Google search results.

0 commit comments

Comments
 (0)