Skip to content

Commit bd1d2d1

Browse files
xokdviumMic92
andcommitted
libutil: Use Boost.URL in parseURL
Boost.URL is a significantly more RFC-compliant parser than what libutil currently has, which is a bundle of incomprehensible regexes. One aspect of this change is that RFC4007 ZoneId IPv6 literals are now represented in URIs according to RFC6874 [1]. Previously they were represented naively like so: [fe80::818c:da4d:8975:415c\%enp0s25]. This is not entirely correct, because the percent sign itself has to be pct-encoded. Quoting RFC6874: > "%" is always treated as an escape character in a URI, so, according to the established URI syntax [RFC3986] any occurrences of literal "%" symbols in a URI MUST be percent-encoded and represented in the form "%25". Thus, the scoped address fe80::a%en1 would appear in a URI as http://[fe80::a%25en1]. (End of quote.) [1]: https://datatracker.ietf.org/doc/html/rfc6874 Co-authored-by: Jörg Thalheim <joerg@thalheim.io>
1 parent d020f21 commit bd1d2d1

File tree

4 files changed

+87
-51
lines changed

4 files changed

+87
-51
lines changed

src/libutil-tests/url.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,9 @@ TEST(parseURL, parseIPv4Address)
124124
ASSERT_EQ(parsed, expected);
125125
}
126126

127-
TEST(parseURL, parseScopedRFC4007IPv6Address)
127+
TEST(parseURL, parseScopedRFC6874IPv6Address)
128128
{
129-
auto s = "http://[fe80::818c:da4d:8975:415c\%enp0s25]:8080";
129+
auto s = "http://[fe80::818c:da4d:8975:415c\%25enp0s25]:8080";
130130
auto parsed = parseURL(s);
131131

132132
ParsedURL expected{

src/libutil/include/nix/util/url-parts.hh

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,10 @@ namespace nix {
88

99
// URI stuff.
1010
const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])";
11-
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
12-
const static std::string ipv6AddressSegmentRegex = "[0-9a-fA-F:]+(?:%\\w+)?";
13-
const static std::string ipv6AddressRegex = "(?:\\[" + ipv6AddressSegmentRegex + "\\]|" + ipv6AddressSegmentRegex + ")";
1411
const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])";
1512
const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])";
16-
const static std::string hostnameRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + ")*)";
17-
const static std::string hostRegex = "(?:" + ipv6AddressRegex + "|" + hostnameRegex + ")";
18-
const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|:)*)";
19-
const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?";
2013
const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])";
21-
const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*";
2214
const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
23-
const static std::string segmentRegex = "(?:" + pcharRegex + "*)";
24-
const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)";
25-
const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)";
2615

2716
/// A Git ref (i.e. branch or tag name).
2817
/// \todo check that this is correct.

src/libutil/include/nix/util/url.hh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ StringMap decodeQuery(const std::string & query);
3434

3535
std::string encodeQuery(const StringMap & query);
3636

37+
/**
38+
* Parse a Nix URL into a ParsedURL.
39+
*
40+
* Nix URI is mostly compliant with RFC3986, but with some deviations:
41+
* - Literal spaces are allowed and don't have to be percent encoded.
42+
* This is mostly done for backward compatibility.
43+
*
44+
* @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
45+
*
46+
* @throws BadURL
47+
*/
3748
ParsedURL parseURL(const std::string & url);
3849

3950
/**

src/libutil/url.cc

Lines changed: 74 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,40 +12,70 @@ std::regex refRegex(refRegexS, std::regex::ECMAScript);
1212
std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript);
1313
std::regex revRegex(revRegexS, std::regex::ECMAScript);
1414

15-
ParsedURL parseURL(const std::string & url)
15+
/**
16+
* Drop the trailing shevron (caret, `^`) suffix used by the output installable syntax.
17+
*
18+
* FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
19+
* get used. That code should actually use ExtendedOutputsSpec::parseOpt.
20+
*/
21+
static std::string_view dropShevronSuffix(std::string_view url)
1622
{
17-
static std::regex uriRegex(
18-
"((" + schemeNameRegex + "):" + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex
19-
+ ")))" + "(?:\\?(" + queryRegex + "))?" + "(?:#(" + fragmentRegex + "))?",
20-
std::regex::ECMAScript);
21-
22-
std::smatch match;
23-
24-
if (std::regex_match(url, match, uriRegex)) {
25-
std::string scheme = match[2];
26-
auto authority = match[3].matched ? std::optional<std::string>(match[3]) : std::nullopt;
27-
std::string path = match[4].matched ? match[4] : match[5];
28-
auto & query = match[6];
29-
auto & fragment = match[7];
30-
31-
auto transportIsFile = parseUrlScheme(scheme).transport == "file";
32-
33-
if (authority && *authority != "" && transportIsFile)
34-
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
35-
36-
if (transportIsFile && path.empty())
37-
path = "/";
38-
39-
return ParsedURL{
40-
.scheme = scheme,
41-
.authority = authority,
42-
.path = percentDecode(path),
43-
.query = decodeQuery(query),
44-
.fragment = percentDecode(std::string(fragment))};
45-
}
23+
auto shevron = url.rfind("^");
24+
if (shevron == std::string_view::npos)
25+
return url;
26+
return url.substr(0, shevron);
27+
}
28+
29+
/**
30+
* Percent encode spaces in the url.
31+
*/
32+
static std::string percentEncodeSpaces(std::string_view url)
33+
{
34+
return replaceStrings(std::string(url), " ", percentEncode(" "));
35+
}
4636

47-
else
48-
throw BadURL("'%s' is not a valid URL", url);
37+
ParsedURL parseURL(const std::string & url)
38+
try {
39+
/* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and
40+
shouldn't appear in normal URIs. */
41+
auto unparsedView = dropShevronSuffix(url);
42+
/* For back-compat literal spaces are allowed. */
43+
auto withFixedSpaces = percentEncodeSpaces(unparsedView);
44+
auto urlView = boost::urls::url_view(withFixedSpaces);
45+
46+
if (!urlView.has_scheme())
47+
throw BadURL("'%s' doesn't have a scheme", url);
48+
49+
auto scheme = urlView.scheme();
50+
auto authority = [&]() -> std::optional<std::string> {
51+
if (urlView.has_authority())
52+
return percentDecode(urlView.authority().buffer());
53+
return std::nullopt;
54+
}();
55+
56+
auto transportIsFile = parseUrlScheme(scheme).transport == "file";
57+
if (authority && *authority != "" && transportIsFile)
58+
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
59+
60+
auto path = urlView.path(); /* Does pct-decoding */
61+
auto fragment = urlView.fragment(); /* Does pct-decoding */
62+
63+
if (transportIsFile && path.empty())
64+
path = "/";
65+
66+
/* Get the raw query. Store URI supports smuggling doubly nested queries, where
67+
the inner &/? are pct-encoded. */
68+
auto query = std::string_view(urlView.encoded_query());
69+
70+
return ParsedURL{
71+
.scheme = scheme,
72+
.authority = authority,
73+
.path = path,
74+
.query = decodeQuery(std::string(query)),
75+
.fragment = fragment,
76+
};
77+
} catch (boost::system::system_error & e) {
78+
throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
4979
}
5080

5181
std::string percentDecode(std::string_view in)
@@ -64,20 +94,25 @@ std::string percentEncode(std::string_view s, std::string_view keep)
6494
}
6595

6696
StringMap decodeQuery(const std::string & query)
67-
{
97+
try {
98+
/* For back-compat literal spaces are allowed. */
99+
auto withFixedSpaces = percentEncodeSpaces(query);
100+
68101
StringMap result;
69102

70-
for (const auto & s : tokenizeString<Strings>(query, "&")) {
71-
auto e = s.find('=');
72-
if (e == std::string::npos) {
73-
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", s, "=");
103+
auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
104+
for (auto && [key, value, value_specified] : encodedQuery) {
105+
if (!value_specified) {
106+
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
74107
continue;
75108
}
76109

77-
result.emplace(s.substr(0, e), percentDecode(std::string_view(s).substr(e + 1)));
110+
result.emplace(key.decode(), value.decode());
78111
}
79112

80113
return result;
114+
} catch (boost::system::system_error & e) {
115+
throw BadURL("invalid URI query '%s': %s", query, e.code().message());
81116
}
82117

83118
const static std::string allowedInQuery = ":@/?";
@@ -150,6 +185,7 @@ std::string fixGitURL(const std::string & url)
150185
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1
151186
bool isValidSchemeName(std::string_view s)
152187
{
188+
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
153189
static std::regex regex(schemeNameRegex, std::regex::ECMAScript);
154190

155191
return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default);

0 commit comments

Comments
 (0)