blob: e7626761fcbca1c0308f0da52e8bbadc4704b4bb [file] [log] [blame]
// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "url.h"
#include <kj/encoding.h>
#include <kj/parse/char.h>
#include <kj/debug.h>
#include <stdlib.h>
namespace kj {
namespace {
constexpr auto ALPHAS = parse::charRange('a', 'z').orRange('A', 'Z');
constexpr auto DIGITS = parse::charRange('0', '9');
constexpr auto END_AUTHORITY = parse::anyOfChars("/?#");
// Authority, path, and query components can typically be terminated by the start of a fragment.
// However, fragments are disallowed in HTTP_REQUEST and HTTP_PROXY_REQUEST contexts. As a quirk, we
// allow the fragment start character ('#') to live unescaped in path and query components. We do
// not currently allow it in the authority component, because our parser would reject it as a host
// character anyway.
const parse::CharGroup_& getEndPathPart(Url::Context context) {
static constexpr auto END_PATH_PART_HREF = parse::anyOfChars("/?#");
static constexpr auto END_PATH_PART_REQUEST = parse::anyOfChars("/?");
switch (context) {
case Url::REMOTE_HREF: return END_PATH_PART_HREF;
case Url::HTTP_PROXY_REQUEST: return END_PATH_PART_REQUEST;
case Url::HTTP_REQUEST: return END_PATH_PART_REQUEST;
}
KJ_UNREACHABLE;
}
const parse::CharGroup_& getEndQueryPart(Url::Context context) {
static constexpr auto END_QUERY_PART_HREF = parse::anyOfChars("&#");
static constexpr auto END_QUERY_PART_REQUEST = parse::anyOfChars("&");
switch (context) {
case Url::REMOTE_HREF: return END_QUERY_PART_HREF;
case Url::HTTP_PROXY_REQUEST: return END_QUERY_PART_REQUEST;
case Url::HTTP_REQUEST: return END_QUERY_PART_REQUEST;
}
KJ_UNREACHABLE;
}
constexpr auto SCHEME_CHARS = ALPHAS.orGroup(DIGITS).orAny("+-.");
constexpr auto NOT_SCHEME_CHARS = SCHEME_CHARS.invert();
constexpr auto HOST_CHARS = ALPHAS.orGroup(DIGITS).orAny(".-:[]_");
// [] is for ipv6 literals.
// _ is not allowed in domain names, but the WHATWG URL spec allows it in hostnames, so we do, too.
// TODO(someday): The URL spec actually allows a lot more than just '_', and requires nameprepping
// to Punycode. We'll have to decide how we want to deal with all that.
void toLower(String& text) {
for (char& c: text) {
if ('A' <= c && c <= 'Z') {
c += 'a' - 'A';
}
}
}
Maybe<ArrayPtr<const char>> trySplit(StringPtr& text, char c) {
KJ_IF_MAYBE(pos, text.findFirst(c)) {
ArrayPtr<const char> result = text.slice(0, *pos);
text = text.slice(*pos + 1);
return result;
} else {
return nullptr;
}
}
Maybe<ArrayPtr<const char>> trySplit(ArrayPtr<const char>& text, char c) {
for (auto i: kj::indices(text)) {
if (text[i] == c) {
ArrayPtr<const char> result = text.slice(0, i);
text = text.slice(i + 1, text.size());
return result;
}
}
return nullptr;
}
ArrayPtr<const char> split(StringPtr& text, const parse::CharGroup_& chars) {
for (auto i: kj::indices(text)) {
if (chars.contains(text[i])) {
ArrayPtr<const char> result = text.slice(0, i);
text = text.slice(i);
return result;
}
}
auto result = text.asArray();
text = "";
return result;
}
String percentDecode(ArrayPtr<const char> text, bool& hadErrors, const Url::Options& options) {
if (options.percentDecode) {
auto result = decodeUriComponent(text);
if (result.hadErrors) hadErrors = true;
return kj::mv(result);
}
return kj::str(text);
}
String percentDecodeQuery(ArrayPtr<const char> text, bool& hadErrors, const Url::Options& options) {
if (options.percentDecode) {
auto result = decodeWwwForm(text);
if (result.hadErrors) hadErrors = true;
return kj::mv(result);
}
return kj::str(text);
}
} // namespace
Url::~Url() noexcept(false) {}
Url Url::clone() const {
return {
kj::str(scheme),
userInfo.map([](const UserInfo& ui) -> UserInfo {
return {
kj::str(ui.username),
ui.password.map([](const String& s) { return kj::str(s); })
};
}),
kj::str(host),
KJ_MAP(part, path) { return kj::str(part); },
hasTrailingSlash,
KJ_MAP(param, query) -> QueryParam {
// Preserve the "allocated-ness" of `param.value` with this careful copy.
return { kj::str(param.name), param.value.begin() == nullptr ? kj::String()
: kj::str(param.value) };
},
fragment.map([](const String& s) { return kj::str(s); }),
options
};
}
Url Url::parse(StringPtr url, Context context, Options options) {
return KJ_REQUIRE_NONNULL(tryParse(url, context, options), "invalid URL", url);
}
Maybe<Url> Url::tryParse(StringPtr text, Context context, Options options) {
Url result;
result.options = options;
bool err = false; // tracks percent-decoding errors
auto& END_PATH_PART = getEndPathPart(context);
auto& END_QUERY_PART = getEndQueryPart(context);
if (context == HTTP_REQUEST) {
if (!text.startsWith("/")) {
return nullptr;
}
} else {
KJ_IF_MAYBE(scheme, trySplit(text, ':')) {
result.scheme = kj::str(*scheme);
} else {
// missing scheme
return nullptr;
}
toLower(result.scheme);
if (result.scheme.size() == 0 ||
!ALPHAS.contains(result.scheme[0]) ||
!SCHEME_CHARS.containsAll(result.scheme.slice(1))) {
// bad scheme
return nullptr;
}
if (!text.startsWith("//")) {
// We require an authority (hostname) part.
return nullptr;
}
text = text.slice(2);
{
auto authority = split(text, END_AUTHORITY);
KJ_IF_MAYBE(userpass, trySplit(authority, '@')) {
if (context != REMOTE_HREF) {
// No user/pass allowed here.
return nullptr;
}
KJ_IF_MAYBE(username, trySplit(*userpass, ':')) {
result.userInfo = UserInfo {
percentDecode(*username, err, options),
percentDecode(*userpass, err, options)
};
} else {
result.userInfo = UserInfo {
percentDecode(*userpass, err, options),
nullptr
};
}
}
result.host = percentDecode(authority, err, options);
if (!HOST_CHARS.containsAll(result.host)) return nullptr;
toLower(result.host);
}
}
while (text.startsWith("/")) {
text = text.slice(1);
auto part = split(text, END_PATH_PART);
if (part.size() == 2 && part[0] == '.' && part[1] == '.') {
if (result.path.size() != 0) {
result.path.removeLast();
}
result.hasTrailingSlash = true;
} else if ((part.size() == 0 && (!options.allowEmpty || text.size() == 0)) ||
(part.size() == 1 && part[0] == '.')) {
// Collapse consecutive slashes and "/./".
result.hasTrailingSlash = true;
} else {
result.path.add(percentDecode(part, err, options));
result.hasTrailingSlash = false;
}
}
if (text.startsWith("?")) {
do {
text = text.slice(1);
auto part = split(text, END_QUERY_PART);
if (part.size() > 0 || options.allowEmpty) {
KJ_IF_MAYBE(key, trySplit(part, '=')) {
result.query.add(QueryParam { percentDecodeQuery(*key, err, options),
percentDecodeQuery(part, err, options) });
} else {
result.query.add(QueryParam { percentDecodeQuery(part, err, options), nullptr });
}
}
} while (text.startsWith("&"));
}
if (text.startsWith("#")) {
if (context != REMOTE_HREF) {
// No fragment allowed here.
return nullptr;
}
result.fragment = percentDecode(text.slice(1), err, options);
} else {
// We should have consumed everything.
KJ_ASSERT(text.size() == 0);
}
if (err) return nullptr;
return kj::mv(result);
}
Url Url::parseRelative(StringPtr url) const {
return KJ_REQUIRE_NONNULL(tryParseRelative(url), "invalid relative URL", url);
}
Maybe<Url> Url::tryParseRelative(StringPtr text) const {
if (text.size() == 0) return clone();
Url result;
result.options = options;
bool err = false; // tracks percent-decoding errors
auto& END_PATH_PART = getEndPathPart(Url::REMOTE_HREF);
auto& END_QUERY_PART = getEndQueryPart(Url::REMOTE_HREF);
// scheme
{
bool gotScheme = false;
for (auto i: kj::indices(text)) {
if (text[i] == ':') {
// found valid scheme
result.scheme = kj::str(text.slice(0, i));
text = text.slice(i + 1);
gotScheme = true;
break;
} else if (NOT_SCHEME_CHARS.contains(text[i])) {
// no scheme
break;
}
}
if (!gotScheme) {
// copy scheme
result.scheme = kj::str(this->scheme);
}
}
// authority
bool hadNewAuthority = text.startsWith("//");
if (hadNewAuthority) {
text = text.slice(2);
auto authority = split(text, END_AUTHORITY);
KJ_IF_MAYBE(userpass, trySplit(authority, '@')) {
KJ_IF_MAYBE(username, trySplit(*userpass, ':')) {
result.userInfo = UserInfo {
percentDecode(*username, err, options),
percentDecode(*userpass, err, options)
};
} else {
result.userInfo = UserInfo {
percentDecode(*userpass, err, options),
nullptr
};
}
}
result.host = percentDecode(authority, err, options);
if (!HOST_CHARS.containsAll(result.host)) return nullptr;
toLower(result.host);
} else {
// copy authority
result.host = kj::str(this->host);
result.userInfo = this->userInfo.map([](const UserInfo& userInfo) {
return UserInfo {
kj::str(userInfo.username),
userInfo.password.map([](const String& password) { return kj::str(password); }),
};
});
}
// path
bool hadNewPath = text.size() > 0 && text[0] != '?' && text[0] != '#';
if (hadNewPath) {
// There's a new path.
if (text[0] == '/') {
// New path is absolute, so don't copy the old path.
text = text.slice(1);
result.hasTrailingSlash = true;
} else if (this->path.size() > 0) {
// New path is relative, so start from the old path, dropping everything after the last
// slash.
auto slice = this->path.slice(0, this->path.size() - (this->hasTrailingSlash ? 0 : 1));
result.path = KJ_MAP(part, slice) { return kj::str(part); };
result.hasTrailingSlash = true;
}
for (;;) {
auto part = split(text, END_PATH_PART);
if (part.size() == 2 && part[0] == '.' && part[1] == '.') {
if (result.path.size() != 0) {
result.path.removeLast();
}
result.hasTrailingSlash = true;
} else if (part.size() == 0 || (part.size() == 1 && part[0] == '.')) {
// Collapse consecutive slashes and "/./".
result.hasTrailingSlash = true;
} else {
result.path.add(percentDecode(part, err, options));
result.hasTrailingSlash = false;
}
if (!text.startsWith("/")) break;
text = text.slice(1);
}
} else if (!hadNewAuthority) {
// copy path
result.path = KJ_MAP(part, this->path) { return kj::str(part); };
result.hasTrailingSlash = this->hasTrailingSlash;
}
if (text.startsWith("?")) {
do {
text = text.slice(1);
auto part = split(text, END_QUERY_PART);
if (part.size() > 0) {
KJ_IF_MAYBE(key, trySplit(part, '=')) {
result.query.add(QueryParam { percentDecodeQuery(*key, err, options),
percentDecodeQuery(part, err, options) });
} else {
result.query.add(QueryParam { percentDecodeQuery(part, err, options),
nullptr });
}
}
} while (text.startsWith("&"));
} else if (!hadNewAuthority && !hadNewPath) {
// copy query
result.query = KJ_MAP(param, this->query) -> QueryParam {
// Preserve the "allocated-ness" of `param.value` with this careful copy.
return { kj::str(param.name), param.value.begin() == nullptr ? kj::String()
: kj::str(param.value) };
};
}
if (text.startsWith("#")) {
result.fragment = percentDecode(text.slice(1), err, options);
} else {
// We should have consumed everything.
KJ_ASSERT(text.size() == 0);
}
if (err) return nullptr;
return kj::mv(result);
}
String Url::toString(Context context) const {
Vector<char> chars(128);
if (context != HTTP_REQUEST) {
chars.addAll(scheme);
chars.addAll(StringPtr("://"));
if (context == REMOTE_HREF) {
KJ_IF_MAYBE(user, userInfo) {
chars.addAll(options.percentDecode ? encodeUriUserInfo(user->username)
: kj::str(user->username));
KJ_IF_MAYBE(pass, user->password) {
chars.add(':');
chars.addAll(options.percentDecode ? encodeUriUserInfo(*pass) : kj::str(*pass));
}
chars.add('@');
}
}
// RFC3986 specifies that hosts can contain percent-encoding escapes while suggesting that
// they should only be used for UTF-8 sequences. However, the DNS standard specifies a
// different way to encode Unicode into domain names and doesn't permit any characters which
// would need to be escaped. Meanwhile, encodeUriComponent() here would incorrectly try to
// escape colons and brackets (e.g. around ipv6 literal addresses). So, instead, we throw if
// the host is invalid.
if (HOST_CHARS.containsAll(host)) {
chars.addAll(host);
} else {
KJ_FAIL_REQUIRE("invalid hostname when stringifying URL", host) {
chars.addAll(StringPtr("invalid-host"));
break;
}
}
}
for (auto& pathPart: path) {
// Protect against path injection.
KJ_REQUIRE((pathPart != "" || options.allowEmpty) && pathPart != "." && pathPart != "..",
"invalid name in URL path", path) {
continue;
}
chars.add('/');
chars.addAll(options.percentDecode ? encodeUriPath(pathPart) : kj::str(pathPart));
}
if (hasTrailingSlash || (path.size() == 0 && context == HTTP_REQUEST)) {
chars.add('/');
}
bool first = true;
for (auto& param: query) {
chars.add(first ? '?' : '&');
first = false;
chars.addAll(options.percentDecode ? encodeWwwForm(param.name) : kj::str(param.name));
if (param.value.begin() != nullptr) {
chars.add('=');
chars.addAll(options.percentDecode ? encodeWwwForm(param.value) : kj::str(param.value));
}
}
if (context == REMOTE_HREF) {
KJ_IF_MAYBE(f, fragment) {
chars.add('#');
chars.addAll(options.percentDecode ? encodeUriFragment(*f) : kj::str(*f));
}
}
chars.add('\0');
return String(chars.releaseAsArray());
}
} // namespace kj