Skip to content

Commit 8f10f02

Browse files
author
Dan Lecocq
committed
Avoid additional string allocations when punycoding.
1 parent 0fac174 commit 8f10f02

File tree

4 files changed

+135
-66
lines changed

4 files changed

+135
-66
lines changed

include/punycode.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,12 @@ namespace Url
5454

5555
// The largest value representable by punycode_uint
5656
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
57-
//Utf8::MAX_CODEPOINT;
58-
//std::numeric_limits<punycode_uint>::max();
57+
58+
/**
59+
* Punycode the utf-8-encoded range [begin, end) and append the result to str.
60+
*/
61+
std::string& encode(std::string& str, std::string::const_iterator begin,
62+
std::string::const_iterator end);
5963

6064
/**
6165
* Replace utf-8-encoded str with its punycode encoding.
@@ -67,6 +71,12 @@ namespace Url
6771
*/
6872
std::string encode(const std::string& str);
6973

74+
/**
75+
* Decode the punycoded range [begin, end) and append the utf-8 result to str.
76+
*/
77+
std::string& decode(std::string& str, std::string::const_iterator begin,
78+
std::string::const_iterator end);
79+
7080
/**
7181
* Replace punycoded str with its utf-8 encoding.
7282
*/
@@ -82,6 +92,12 @@ namespace Url
8292
*/
8393
bool needsPunycoding(const std::string& str);
8494

95+
/**
96+
* Determine if the characters between these two iterators need punycoding.
97+
*/
98+
bool needsPunycoding(const std::string::const_iterator& begin,
99+
const std::string::const_iterator& end);
100+
85101
/**
86102
* Internal function for calculating bias.
87103
*/

src/punycode.cpp

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,16 @@ namespace Url
99
{
1010

1111
std::string& Punycode::encode(std::string& str)
12+
{
13+
std::string output;
14+
encode(output, str.cbegin(), str.cend());
15+
str.assign(output);
16+
return str;
17+
}
18+
19+
std::string& Punycode::encode(std::string& output,
20+
std::string::const_iterator begin,
21+
std::string::const_iterator end)
1222
{
1323
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
1424
//
@@ -18,25 +28,26 @@ namespace Url
1828
punycode_uint n = INITIAL_N;
1929
punycode_uint delta = 0;
2030
punycode_uint bias = INITIAL_BIAS;
21-
std::string output;
31+
32+
// let h = b = the number of basic code points in the input
33+
size_t h = 0;
34+
size_t b = 0;
2235

2336
// Accumulate every codepoint, copying the basic ones to the output as we go
2437
std::vector<punycode_uint> codepoints;
25-
for (auto it = str.cbegin(); it != str.cend(); )
38+
while (begin != end)
2639
{
27-
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
40+
Utf8::codepoint_t value = Utf8::readCodepoint(begin, end);
2841
if (value < 0x80)
2942
{
3043
// copy them to the output in order
3144
output.append(1, static_cast<char>(value));
45+
++h;
46+
++b;
3247
}
3348
codepoints.push_back(value);
3449
}
3550

36-
// let h = b = the number of basic code points in the input
37-
size_t h = output.size();
38-
size_t b = h;
39-
4051
// copy a delimiter if b > 0
4152
if (b > 0)
4253
{
@@ -125,9 +136,8 @@ namespace Url
125136
++delta;
126137
++n;
127138
}
128-
129-
str.assign(output);
130-
return str;
139+
140+
return output;
131141
}
132142

133143
std::string Punycode::encode(const std::string& str)
@@ -137,7 +147,8 @@ namespace Url
137147
return result;
138148
}
139149

140-
std::string& Punycode::decode(std::string& str)
150+
std::string& Punycode::decode(std::string& str, std::string::const_iterator begin,
151+
std::string::const_iterator end)
141152
{
142153
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
143154
//
@@ -150,15 +161,18 @@ namespace Url
150161
punycode_uint bias = INITIAL_BIAS;
151162
std::vector<punycode_uint> codepoints;
152163

153-
size_t index = str.rfind('-');
154-
if (index == std::string::npos)
164+
std::string::const_iterator index = end;
165+
for (; index != begin; --index)
155166
{
156-
index = 0;
167+
if (*index == '-')
168+
{
169+
break;
170+
}
157171
}
158172

159173
// consume all code points before the last delimiter (if there is one)
160174
// and copy them to output, fail on any non-basic code point
161-
for (auto it = str.begin(); it != (str.begin() + index); ++it)
175+
for (auto it = begin; it != index; ++it)
162176
{
163177
if (static_cast<unsigned char>(*it) > 127U)
164178
{
@@ -169,13 +183,13 @@ namespace Url
169183

170184
// if more than zero code points were consumed then consume one more
171185
// (which will be the last delimiter)
172-
if (index > 0)
186+
if (index != begin)
173187
{
174-
index += 1;
188+
++index;
175189
}
176190

177191
// while the input is not exhausted do begin
178-
for (auto it = (str.begin() + index); it != str.end(); ++it)
192+
for (auto it = index; it != end; ++it)
179193
{
180194
// let oldi = i
181195
// let w = 1
@@ -186,7 +200,7 @@ namespace Url
186200
for (punycode_uint k = BASE; ; k += BASE, ++it)
187201
{
188202
// consume a code point, or fail if there was none to consume
189-
if (it == str.end())
203+
if (it == end)
190204
{
191205
throw std::invalid_argument("Premature termination");
192206
}
@@ -275,16 +289,22 @@ namespace Url
275289
++i;
276290
}
277291

278-
std::string output;
279292
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
280293
{
281-
Utf8::writeCodepoint(output, *it);
294+
Utf8::writeCodepoint(str, *it);
282295
}
283-
str.assign(output);
284296

285297
return str;
286298
}
287299

300+
std::string& Punycode::decode(std::string& str)
301+
{
302+
std::string output;
303+
decode(output, str.cbegin(), str.cend());
304+
str.assign(output);
305+
return str;
306+
}
307+
288308
std::string Punycode::decode(const std::string& str)
289309
{
290310
std::string result(str);
@@ -293,10 +313,16 @@ namespace Url
293313
}
294314

295315
bool Punycode::needsPunycoding(const std::string& str)
316+
{
317+
return needsPunycoding(str.cbegin(), str.cend());
318+
}
319+
320+
bool Punycode::needsPunycoding(const std::string::const_iterator& begin,
321+
const std::string::const_iterator& end)
296322
{
297323
return std::any_of(
298-
str.begin(),
299-
str.end(),
324+
begin,
325+
end,
300326
[](char i){ return static_cast<unsigned char>(i) & 0x80; });
301327
}
302328

src/url.cpp

Lines changed: 55 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -663,33 +663,36 @@ namespace Url
663663

664664
std::string encoded;
665665

666-
size_t start = 0;
667-
size_t end = host_.find('.');
668-
while(true)
666+
auto last = host_.cbegin();
667+
for (auto it = host_.cbegin(); it != host_.cend(); ++it)
669668
{
670-
std::string segment = host_.substr(start, end - start);
671-
if (Punycode::needsPunycoding(segment))
669+
if (*it == '.')
672670
{
673-
encoded.append("xn--");
674-
encoded.append(Punycode::encode(segment));
675-
}
676-
else
677-
{
678-
encoded.append(segment);
679-
}
671+
if (Punycode::needsPunycoding(last, it))
672+
{
673+
encoded.append("xn--");
674+
Punycode::encode(encoded, last, it);
675+
}
676+
else
677+
{
678+
encoded.append(last, it);
679+
}
680680

681-
if (end == std::string::npos)
682-
{
683-
break;
684-
}
685-
else
686-
{
687681
encoded.append(1, '.');
688-
start = end + 1;
689-
end = host_.find('.', start);
682+
last = it + 1;
690683
}
691684
}
692685

686+
if (Punycode::needsPunycoding(last, host_.cend()))
687+
{
688+
encoded.append("xn--");
689+
Punycode::encode(encoded, last, host_.cend());
690+
}
691+
else
692+
{
693+
encoded.append(last, host_.cend());
694+
}
695+
693696
host_.assign(encoded);
694697

695698
return *this;
@@ -698,36 +701,48 @@ namespace Url
698701
Url& Url::unpunycode()
699702
{
700703
std::string unencoded;
704+
std::string prefix;
701705

702-
size_t start = 0;
703-
size_t end = host_.find('.');
704-
while(true)
706+
auto last = host_.cbegin();
707+
for (auto it = host_.cbegin(); it != host_.cend(); ++it)
705708
{
706-
std::string segment = host_.substr(start, end - start);
707-
if (segment.substr(0, 4).compare("xn--") == 0)
709+
if (*it == '.')
708710
{
709-
segment = segment.substr(4);
710-
unencoded.append(Punycode::decode(segment));
711-
}
712-
else
713-
{
714-
unencoded.append(segment);
715-
}
711+
// Starts with 'xn--'
712+
size_t distance = it - last;
713+
if (distance > 4)
714+
{
715+
prefix.assign(last, last + 4);
716+
if (prefix == "xn--")
717+
{
718+
Punycode::decode(unencoded, last + 4, it);
719+
unencoded.append(1, '.');
720+
last = it + 1;
721+
continue;
722+
}
723+
}
716724

717-
if (end == std::string::npos)
718-
{
719-
break;
725+
unencoded.append(last, it);
726+
unencoded.append(1, '.');
727+
last = it + 1;
720728
}
721-
else
729+
}
730+
731+
// Last segment
732+
size_t distance = host_.cend() - last;
733+
if (distance > 4)
734+
{
735+
prefix.assign(last, last + 4);
736+
if (prefix == "xn--")
722737
{
723-
unencoded.append(1, '.');
724-
start = end + 1;
725-
end = host_.find('.', start);
738+
Punycode::decode(unencoded, last + 4, host_.cend());
739+
host_.assign(unencoded);
740+
return *this;
726741
}
727742
}
728743

744+
unencoded.append(last, host_.cend());
729745
host_.assign(unencoded);
730-
731746
return *this;
732747
}
733748

test/test-url.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,18 @@ TEST(DefragTest, Defrag)
886886
Url::Url("http://foo.com/path#fragment").defrag().str());
887887
}
888888

889+
TEST(PunycodeTest, UnpunycodeShortIdentifierAtEnd)
890+
{
891+
std::string example("http://www.xn-/");
892+
EXPECT_EQ(example, Url::Url(example).unpunycode().str());
893+
}
894+
895+
TEST(PunycodeTest, UnpunycodeShortIdentifierAtStart)
896+
{
897+
std::string example("http://xn-.com/");
898+
EXPECT_EQ(example, Url::Url(example).unpunycode().str());
899+
}
900+
889901
TEST(PunycodeTest, German)
890902
{
891903
std::string unencoded("http://www.kündigen.de/");

0 commit comments

Comments
 (0)