Skip to content

Commit 9f454eb

Browse files
author
Dan Lecocq
committed
Avoid additional string allocations when punycoding.
1 parent 87c3476 commit 9f454eb

File tree

4 files changed

+134
-66
lines changed

4 files changed

+134
-66
lines changed

include/punycode.h

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -54,8 +54,12 @@ namespace Url
5454

5555
// The largest value representable by punycode_uint
5656
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
57-
//Utf8::MAX_CODEPOINT;
58-
//std::numeric_limits<punycode_uint>::max();
57+
58+
/**
59+
* Punycode the utf-8-encoded range [begin, end) and append the result to str.
60+
*/
61+
std::string& encode(std::string& str, std::string::const_iterator begin,
62+
std::string::const_iterator end);
5963

6064
/**
6165
* Replace the utf-8-encoded str with its punycode encoding.
@@ -67,6 +71,12 @@ namespace Url
6771
*/
6872
std::string encode(const std::string& str);
6973

74+
/**
75+
* Decode the punycoded characters in [begin, end) and append the resulting utf-8 to str.
76+
*/
77+
std::string& decode(std::string& str, std::string::const_iterator begin,
78+
std::string::const_iterator end);
79+
7080
/**
7181
* Replace the punycoded str with its utf-8-encoded form.
7282
*/
@@ -82,6 +92,12 @@ namespace Url
8292
*/
8393
bool needsPunycoding(const std::string& str);
8494

95+
/**
96+
* Determine if the characters between these two iterators need punycoding.
97+
*/
98+
bool needsPunycoding(const std::string::const_iterator& begin,
99+
const std::string::const_iterator& end);
100+
85101
/**
86102
* Internal function for calculating bias.
87103
*/

src/punycode.cpp

Lines changed: 49 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,15 @@ namespace Url
99
{
1010

1111
std::string& Punycode::encode(std::string& str)
12+
{
13+
std::string output;
14+
encode(output, str.cbegin(), str.cend());
15+
return str = output;
16+
}
17+
18+
std::string& Punycode::encode(std::string& output,
19+
std::string::const_iterator begin,
20+
std::string::const_iterator end)
1221
{
1322
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
1423
//
@@ -18,25 +27,26 @@ namespace Url
1827
punycode_uint n = INITIAL_N;
1928
punycode_uint delta = 0;
2029
punycode_uint bias = INITIAL_BIAS;
21-
std::string output;
30+
31+
// let h = b = the number of basic code points in the input
32+
size_t h = 0;
33+
size_t b = 0;
2234

2335
// Accumulate the non-basic codepoints
2436
std::vector<punycode_uint> codepoints;
25-
for (auto it = str.cbegin(); it != str.cend(); )
37+
while (begin != end)
2638
{
27-
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
39+
Utf8::codepoint_t value = Utf8::readCodepoint(begin, end);
2840
if (value < 0x80)
2941
{
3042
// copy them to the output in order
3143
output.append(1, static_cast<char>(value));
44+
++h;
45+
++b;
3246
}
3347
codepoints.push_back(value);
3448
}
3549

36-
// let h = b = the number of basic code points in the input
37-
size_t h = output.size();
38-
size_t b = h;
39-
4050
// copy a delimiter if b > 0
4151
if (b > 0)
4252
{
@@ -125,9 +135,8 @@ namespace Url
125135
++delta;
126136
++n;
127137
}
128-
129-
str.assign(output);
130-
return str;
138+
139+
return output;
131140
}
132141

133142
std::string Punycode::encode(const std::string& str)
@@ -137,7 +146,8 @@ namespace Url
137146
return result;
138147
}
139148

140-
std::string& Punycode::decode(std::string& str)
149+
std::string& Punycode::decode(std::string& str, std::string::const_iterator begin,
150+
std::string::const_iterator end)
141151
{
142152
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
143153
//
@@ -150,15 +160,18 @@ namespace Url
150160
punycode_uint bias = INITIAL_BIAS;
151161
std::vector<punycode_uint> codepoints;
152162

153-
size_t index = str.rfind('-');
154-
if (index == std::string::npos)
163+
std::string::const_iterator index = end;
164+
for (; index != begin; --index)
155165
{
156-
index = 0;
166+
if (*index == '-')
167+
{
168+
break;
169+
}
157170
}
158171

159172
// consume all code points before the last delimiter (if there is one)
160173
// and copy them to output, fail on any non-basic code point
161-
for (auto it = str.begin(); it != (str.begin() + index); ++it)
174+
for (auto it = begin; it != index; ++it)
162175
{
163176
if (static_cast<unsigned char>(*it) > 127U)
164177
{
@@ -169,13 +182,13 @@ namespace Url
169182

170183
// if more than zero code points were consumed then consume one more
171184
// (which will be the last delimiter)
172-
if (index > 0)
185+
if (index != begin)
173186
{
174-
index += 1;
187+
++index;
175188
}
176189

177190
// while the input is not exhausted do begin
178-
for (auto it = (str.begin() + index); it != str.end(); ++it)
191+
for (auto it = index; it != end; ++it)
179192
{
180193
// let oldi = i
181194
// let w = 1
@@ -186,7 +199,7 @@ namespace Url
186199
for (punycode_uint k = BASE; ; k += BASE, ++it)
187200
{
188201
// consume a code point, or fail if there was none to consume
189-
if (it == str.end())
202+
if (it == end)
190203
{
191204
throw std::invalid_argument("Premature termination");
192205
}
@@ -275,16 +288,22 @@ namespace Url
275288
++i;
276289
}
277290

278-
std::string output;
279291
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
280292
{
281-
Utf8::writeCodepoint(output, *it);
293+
Utf8::writeCodepoint(str, *it);
282294
}
283-
str.assign(output);
284295

285296
return str;
286297
}
287298

299+
std::string& Punycode::decode(std::string& str)
300+
{
301+
std::string output;
302+
decode(output, str.cbegin(), str.cend());
303+
str.assign(output);
304+
return str;
305+
}
306+
288307
std::string Punycode::decode(const std::string& str)
289308
{
290309
std::string result(str);
@@ -293,10 +312,16 @@ namespace Url
293312
}
294313

295314
bool Punycode::needsPunycoding(const std::string& str)
315+
{
316+
return needsPunycoding(str.cbegin(), str.cend());
317+
}
318+
319+
bool Punycode::needsPunycoding(const std::string::const_iterator& begin,
320+
const std::string::const_iterator& end)
296321
{
297322
return std::any_of(
298-
str.begin(),
299-
str.end(),
323+
begin,
324+
end,
300325
[](char i){ return static_cast<unsigned char>(i) & 0x80; });
301326
}
302327

src/url.cpp

Lines changed: 55 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -709,33 +709,36 @@ namespace Url
709709

710710
std::string encoded;
711711

712-
size_t start = 0;
713-
size_t end = host_.find('.');
714-
while(true)
712+
auto last = host_.cbegin();
713+
for (auto it = host_.cbegin(); it != host_.cend(); ++it)
715714
{
716-
std::string segment = host_.substr(start, end - start);
717-
if (Punycode::needsPunycoding(segment))
715+
if (*it == '.')
718716
{
719-
encoded.append("xn--");
720-
encoded.append(Punycode::encode(segment));
721-
}
722-
else
723-
{
724-
encoded.append(segment);
725-
}
717+
if (Punycode::needsPunycoding(last, it))
718+
{
719+
encoded.append("xn--");
720+
Punycode::encode(encoded, last, it);
721+
}
722+
else
723+
{
724+
encoded.append(last, it);
725+
}
726726

727-
if (end == std::string::npos)
728-
{
729-
break;
730-
}
731-
else
732-
{
733727
encoded.append(1, '.');
734-
start = end + 1;
735-
end = host_.find('.', start);
728+
last = it + 1;
736729
}
737730
}
738731

732+
if (Punycode::needsPunycoding(last, host_.cend()))
733+
{
734+
encoded.append("xn--");
735+
Punycode::encode(encoded, last, host_.cend());
736+
}
737+
else
738+
{
739+
encoded.append(last, host_.cend());
740+
}
741+
739742
host_.assign(encoded);
740743

741744
return *this;
@@ -744,36 +747,48 @@ namespace Url
744747
Url& Url::unpunycode()
745748
{
746749
std::string unencoded;
750+
std::string prefix;
747751

748-
size_t start = 0;
749-
size_t end = host_.find('.');
750-
while(true)
752+
auto last = host_.cbegin();
753+
for (auto it = host_.cbegin(); it != host_.cend(); ++it)
751754
{
752-
std::string segment = host_.substr(start, end - start);
753-
if (segment.substr(0, 4).compare("xn--") == 0)
755+
if (*it == '.')
754756
{
755-
segment = segment.substr(4);
756-
unencoded.append(Punycode::decode(segment));
757-
}
758-
else
759-
{
760-
unencoded.append(segment);
761-
}
757+
// Starts with 'xn--'
758+
size_t distance = it - last;
759+
if (distance > 4)
760+
{
761+
prefix.assign(last, last + 4);
762+
if (prefix == "xn--")
763+
{
764+
Punycode::decode(unencoded, last + 4, it);
765+
unencoded.append(1, '.');
766+
last = it + 1;
767+
continue;
768+
}
769+
}
762770

763-
if (end == std::string::npos)
764-
{
765-
break;
771+
unencoded.append(last, it);
772+
unencoded.append(1, '.');
773+
last = it + 1;
766774
}
767-
else
775+
}
776+
777+
// Last segment
778+
size_t distance = host_.cend() - last;
779+
if (distance > 4)
780+
{
781+
prefix.assign(last, last + 4);
782+
if (prefix == "xn--")
768783
{
769-
unencoded.append(1, '.');
770-
start = end + 1;
771-
end = host_.find('.', start);
784+
Punycode::decode(unencoded, last + 4, host_.cend());
785+
host_.assign(unencoded);
786+
return *this;
772787
}
773788
}
774789

790+
unencoded.append(last, host_.cend());
775791
host_.assign(unencoded);
776-
777792
return *this;
778793
}
779794

test/test-url.cpp

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1108,6 +1108,18 @@ TEST(DefragTest, Defrag)
11081108
Url::Url("http://foo.com/path#fragment").defrag().str());
11091109
}
11101110

1111+
TEST(PunycodeTest, UnpunycodeShortIdentifierAtEnd)
1112+
{
1113+
std::string example("http://www.xn-/");
1114+
EXPECT_EQ(example, Url::Url(example).unpunycode().str());
1115+
}
1116+
1117+
TEST(PunycodeTest, UnpunycodeShortIdentifierAtStart)
1118+
{
1119+
std::string example("http://xn-.com/");
1120+
EXPECT_EQ(example, Url::Url(example).unpunycode().str());
1121+
}
1122+
11111123
TEST(PunycodeTest, German)
11121124
{
11131125
std::string unencoded("http://www.kündigen.de/");

0 commit comments

Comments
 (0)