Skip to content

Commit b591965

Browse files
committed
libutil: Use Boost.URL in parseURL
Boost.URL is a significantly more RFC-compliant parser than what libutil currently has, which is a bundle of incomprehensible regexes. One aspect of this change is that RFC4007 ZoneId IPv6 literals are represented in URIs according to RFC6874 [1]. Previously they were represented naively like so: [fe80::818c:da4d:8975:415c\%enp0s25]. This is not entirely correct, because the percent itself has to be pct-encoded: > "%" is always treated as an escape character in a URI, so, according to the established URI syntax [RFC3986] any occurrences of literal "%" symbols in a URI MUST be percent-encoded and represented in the form "%25". Thus, the scoped address fe80::a%en1 would appear in a URI as http://[fe80::a%25en1]. [1]: https://datatracker.ietf.org/doc/html/rfc6874
1 parent d79e714 commit b591965

File tree

4 files changed

+85
-58
lines changed

4 files changed

+85
-58
lines changed

src/libutil-tests/url.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,8 @@ namespace nix {
117117
ASSERT_EQ(parsed, expected);
118118
}
119119

120-
TEST(parseURL, parseScopedRFC4007IPv6Address) {
121-
auto s = "http://[fe80::818c:da4d:8975:415c\%enp0s25]:8080";
120+
TEST(parseURL, parseScopedRFC6874IPv6Address) {
121+
auto s = "http://[fe80::818c:da4d:8975:415c\%25enp0s25]:8080";
122122
auto parsed = parseURL(s);
123123

124124
ParsedURL expected {

src/libutil/include/nix/util/url-parts.hh

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,10 @@ namespace nix {
88

99
// URI stuff.
1010
const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])";
11-
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
12-
const static std::string ipv6AddressSegmentRegex = "[0-9a-fA-F:]+(?:%\\w+)?";
13-
const static std::string ipv6AddressRegex = "(?:\\[" + ipv6AddressSegmentRegex + "\\]|" + ipv6AddressSegmentRegex + ")";
1411
const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])";
1512
const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])";
16-
const static std::string hostnameRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + ")*)";
17-
const static std::string hostRegex = "(?:" + ipv6AddressRegex + "|" + hostnameRegex + ")";
18-
const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|:)*)";
19-
const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?";
2013
const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])";
21-
const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*";
2214
const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
23-
const static std::string segmentRegex = "(?:" + pcharRegex + "*)";
24-
const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)";
25-
const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)";
2615

2716
/// A Git ref (i.e. branch or tag name).
2817
/// \todo check that this is correct.

src/libutil/include/nix/util/url.hh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ StringMap decodeQuery(const std::string & query);
3434

3535
std::string encodeQuery(const StringMap & query);
3636

37+
/**
38+
* Parse a Nix URL into a ParsedURL.
39+
*
40+
* Nix URIs are mostly compliant with RFC3986, but with some deviations:
41+
* - Literal spaces are allowed and don't have to be percent encoded.
42+
* This is mostly done for backward compatibility.
43+
*
44+
* @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
45+
*
46+
* @throws BadURL
47+
*/
3748
ParsedURL parseURL(const std::string & url);
3849

3950
/**

src/libutil/url.cc

Lines changed: 72 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -12,45 +12,68 @@ std::regex refRegex(refRegexS, std::regex::ECMAScript);
1212
std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript);
1313
std::regex revRegex(revRegexS, std::regex::ECMAScript);
1414

15-
ParsedURL parseURL(const std::string & url)
15+
/**
16+
* Drop the trailing chevron ('^') and everything after it, as used by the output installable syntax.
17+
*
18+
* FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
19+
* get used. That code should actually use ExtendedOutputsSpec::parseOpt.
20+
*/
21+
static std::string_view dropShevronSuffix(std::string_view url)
1622
{
17-
static std::regex uriRegex(
18-
"((" + schemeNameRegex + "):"
19-
+ "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex + ")))"
20-
+ "(?:\\?(" + queryRegex + "))?"
21-
+ "(?:#(" + fragmentRegex + "))?",
22-
std::regex::ECMAScript);
23-
24-
std::smatch match;
25-
26-
if (std::regex_match(url, match, uriRegex)) {
27-
std::string scheme = match[2];
28-
auto authority = match[3].matched
29-
? std::optional<std::string>(match[3]) : std::nullopt;
30-
std::string path = match[4].matched ? match[4] : match[5];
31-
auto & query = match[6];
32-
auto & fragment = match[7];
33-
34-
auto transportIsFile = parseUrlScheme(scheme).transport == "file";
35-
36-
if (authority && *authority != "" && transportIsFile)
37-
throw BadURL("file:// URL '%s' has unexpected authority '%s'",
38-
url, *authority);
39-
40-
if (transportIsFile && path.empty())
41-
path = "/";
42-
43-
return ParsedURL{
44-
.scheme = scheme,
45-
.authority = authority,
46-
.path = percentDecode(path),
47-
.query = decodeQuery(query),
48-
.fragment = percentDecode(std::string(fragment))
49-
};
50-
}
23+
auto shevron = url.rfind("^");
24+
return url.substr(0, shevron);
25+
}
5126

52-
else
53-
throw BadURL("'%s' is not a valid URL", url);
27+
/**
28+
* Percent encode spaces in the url.
29+
*/
30+
static std::string percentEncodeSpaces(std::string_view url)
31+
{
32+
return replaceStrings(std::string(url), " ", percentEncode(" "));
33+
}
34+
35+
ParsedURL parseURL(const std::string & url)
36+
try {
37+
/* Drop the chevron suffix used for the flakerefs. The chevron character is reserved and
38+
shouldn't appear in normal URIs. */
39+
auto unparsedView = dropShevronSuffix(url);
40+
/* For back-compat literal spaces are allowed. */
41+
auto withFixedSpaces = percentEncodeSpaces(unparsedView);
42+
auto urlView = boost::urls::url_view(withFixedSpaces);
43+
44+
if (!urlView.has_scheme())
45+
throw BadURL("'%s' doesn't have a scheme", url);
46+
47+
auto scheme = urlView.scheme();
48+
auto authority = [&]() -> std::optional<std::string> {
49+
if (urlView.has_authority())
50+
return percentDecode(urlView.authority().buffer());
51+
return std::nullopt;
52+
}();
53+
54+
auto transportIsFile = parseUrlScheme(scheme).transport == "file";
55+
if (authority && *authority != "" && transportIsFile)
56+
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
57+
58+
auto path = urlView.path(); /* Does pct-decoding */
59+
auto fragment = urlView.fragment(); /* Does pct-decoding */
60+
61+
if (transportIsFile && path.empty())
62+
path = "/";
63+
64+
/* Get the raw query. Store URI supports smuggling doubly nested queries, where
65+
the inner &/? are pct-encoded. */
66+
auto query = std::string_view(urlView.encoded_query());
67+
68+
return ParsedURL{
69+
.scheme = scheme,
70+
.authority = authority,
71+
.path = path,
72+
.query = decodeQuery(std::string(query)),
73+
.fragment = fragment,
74+
};
75+
} catch (boost::system::system_error & e) {
76+
throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
5477
}
5578

5679
std::string percentDecode(std::string_view in)
@@ -69,22 +92,25 @@ std::string percentEncode(std::string_view s, std::string_view keep)
6992
}
7093

7194
StringMap decodeQuery(const std::string & query)
72-
{
95+
try {
96+
/* For back-compat literal spaces are allowed. */
97+
auto withFixedSpaces = percentEncodeSpaces(query);
98+
7399
StringMap result;
74100

75-
for (const auto & s : tokenizeString<Strings>(query, "&")) {
76-
auto e = s.find('=');
77-
if (e == std::string::npos) {
78-
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", s, "=");
101+
auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
102+
for (auto && [key, value, value_specified] : encodedQuery) {
103+
if (!value_specified) {
104+
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
79105
continue;
80106
}
81107

82-
result.emplace(
83-
s.substr(0, e),
84-
percentDecode(std::string_view(s).substr(e + 1)));
108+
result.emplace(key.decode(), value.decode());
85109
}
86110

87111
return result;
112+
} catch (boost::system::system_error & e) {
113+
throw BadURL("invalid URI query '%s'", e.code().message());
88114
}
89115

90116
const static std::string allowedInQuery = ":@/?";
@@ -165,6 +191,7 @@ std::string fixGitURL(const std::string & url)
165191
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1
166192
bool isValidSchemeName(std::string_view s)
167193
{
194+
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
168195
static std::regex regex(schemeNameRegex, std::regex::ECMAScript);
169196

170197
return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default);

0 commit comments

Comments
 (0)