Skip to content

Commit 304a52d

Browse files
committed
simple_dom: respect the size of text, and make parsers exception-safe. (#951)
1 parent 8ed13f0 commit 304a52d

File tree

6 files changed

+70
-23
lines changed

6 files changed

+70
-23
lines changed

ecosystem/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ FetchContent_Declare(
2929
URL ${PHOTON_RAPIDXML_SOURCE}
3030
URL_HASH
3131
SHA256=c3f0b886374981bb20fabcf323d755db4be6dba42064599481da64a85f5b3571
32+
PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/patches/rapidxml.patch
3233
UPDATE_DISCONNECTED 1)
3334
FetchContent_MakeAvailable(rapidxml)
3435
message(STATUS "Rapidxml source dir: ${rapidxml_SOURCE_DIR}")

ecosystem/patches/rapidjson.patch

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ index 19f8849b..618492a4 100644
99
+ kParseBoolsAsStringFlag = 512, //!< Parse all booleans (true/false) as strings.
1010
kParseDefaultFlags = RAPIDJSON_PARSE_DEFAULT_FLAGS //!< Default parse flags. Can be customized by defining RAPIDJSON_PARSE_DEFAULT_FLAGS
1111
};
12-
12+
1313
@@ -201,6 +202,8 @@ struct BaseReaderHandler {
1414
bool Default() { return true; }
1515
bool Null() { return static_cast<Override&>(*this).Default(); }
@@ -22,7 +22,7 @@ index 19f8849b..618492a4 100644
2222
@@ -714,13 +717,22 @@ private:
2323
RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell());
2424
}
25-
25+
2626
+ template<unsigned parseFlags, typename InputStream, typename Handler>
2727
+ void ParseRawBools(InputStream& is, Handler& handler) {
2828
+
@@ -33,7 +33,7 @@ index 19f8849b..618492a4 100644
3333
RAPIDJSON_ASSERT(is.Peek() == 't');
3434
+ auto begin = is.PutBegin();
3535
is.Take();
36-
36+
3737
if (RAPIDJSON_LIKELY(Consume(is, 'r') && Consume(is, 'u') && Consume(is, 'e'))) {
3838
- if (RAPIDJSON_UNLIKELY(!handler.Bool(true)))
3939
+ auto copy = !(parseFlags & kParseInsituFlag);
@@ -49,7 +49,7 @@ index 19f8849b..618492a4 100644
4949
RAPIDJSON_ASSERT(is.Peek() == 'f');
5050
+ auto begin = is.PutBegin();
5151
is.Take();
52-
52+
5353
if (RAPIDJSON_LIKELY(Consume(is, 'a') && Consume(is, 'l') && Consume(is, 's') && Consume(is, 'e'))) {
5454
- if (RAPIDJSON_UNLIKELY(!handler.Bool(false)))
5555
+ auto copy = !(parseFlags & kParseInsituFlag);
@@ -59,3 +59,18 @@ index 19f8849b..618492a4 100644
5959
RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
6060
}
6161
else
62+
diff --git a/include/rapidjson/stream.h b/include/rapidjson/stream.h
63+
index fef82c25..cd51ccd3 100644
64+
--- a/include/rapidjson/stream.h
65+
+++ b/include/rapidjson/stream.h
66+
@@ -147,8 +147,8 @@ struct GenericInsituStringStream {
67+
GenericInsituStringStream(Ch *src) : src_(src), dst_(0), head_(src) {}
68+
69+
// Read
70+
- Ch Peek() { return *src_; }
71+
- Ch Take() { return *src_++; }
72+
+ Ch Peek() { return *src_ ? *src_ : '}'; }
73+
+ Ch Take() { return *src_ ? *src_++ : '}'; }
74+
size_t Tell() { return static_cast<size_t>(src_ - head_); }
75+
76+
// Write

ecosystem/patches/rapidxml.patch

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
--- rapidxml.hpp
2+
+++ rapidxml.hpp
3+
@@ -2205,6 +2205,8 @@
4+
}
5+
// Skip remaining whitespace after node name
6+
skip<whitespace_pred, Flags>(text);
7+
+ if (*text == Ch('\0'))
8+
+ return; // treat it as '>' without incrementing text
9+
if (*text != Ch('>'))
10+
RAPIDXML_PARSE_ERROR("expected >", text);
11+
++text; // Skip '>'

ecosystem/simple_dom.cpp

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <stdlib.h>
88
#include <vector>
99
#include <algorithm>
10+
#include <memory>
1011
#include <photon/common/alog.h>
1112
#include <photon/common/alog-stdstring.h>
1213
#include <photon/common/utility.h>
@@ -181,11 +182,13 @@ struct JHandler : public BaseReaderHandler<UTF8<>, JHandler> {
181182
_root = new JNode(text, text_ownership);
182183
}
183184
~JHandler() {
185+
delete _root;
186+
}
187+
JNode* get_root() {
184188
assert(_nodes.size() == 1);
185189
assert(_nodes.front().size() == 1);
186190
_root->set_children(std::move(_nodes.front().front()._children));
187-
}
188-
JNode* get_root() {
191+
DEFER(_root = nullptr);
189192
return _root;
190193
}
191194
void emplace_back(const char* s, size_t length, uint8_t type) {
@@ -248,9 +251,18 @@ struct JHandler : public BaseReaderHandler<UTF8<>, JHandler> {
248251
}
249252
};
250253

254+
// Some parsers don't support an explicit text length; they only accept
255+
// null-terminated strings. So we overwrite the last trailer with '\0',
256+
// while making the parser treat that '\0' as the trailer.
257+
inline void fix_trail(char* text, size_t size, char trailer) {
258+
auto i = estring_view(text, size).rfind(trailer);
259+
if (i != estring_view::npos) text[i] = '\0';
260+
}
261+
251262
static NodeImpl* parse_json(char* text, size_t size, int flags) {
252263
const auto kFlags = kParseNumbersAsStringsFlag | kParseBoolsAsStringFlag |
253264
kParseInsituFlag | kParseCommentsFlag | kParseTrailingCommasFlag;
265+
fix_trail(text, size, '}');
254266
JHandler h(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
255267
using Encoding = UTF8<>;
256268
GenericInsituStringStream<Encoding> s(text);
@@ -299,12 +311,13 @@ class XMLNode : public DocNode<XMLNode> {
299311
};
300312

301313
static NodeImpl* parse_xml(char* text, size_t size, int flags) {
314+
fix_trail(text, size, '>');
302315
xml_document<char> doc;
303316
doc.parse<0>(text);
304-
auto root = new XMLNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
317+
auto root = make_unique<XMLNode>(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
305318
assert(root);
306319
root->build(&doc);
307-
return root;
320+
return root.release();
308321
}
309322

310323
class YAMLNode : public DocNode<YAMLNode> {
@@ -330,10 +343,10 @@ class YAMLNode : public DocNode<YAMLNode> {
330343

331344
static NodeImpl* parse_yaml(char* text, size_t size, int flags) {
332345
auto yaml = ryml::parse_in_place({text, size});
333-
auto root = new YAMLNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
346+
auto root = make_unique<YAMLNode>(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
334347
assert(root);
335348
root->build(yaml.rootref());
336-
return root;
349+
return root.release();
337350
}
338351

339352
class IniNode : public DocNode<IniNode> {
@@ -394,24 +407,24 @@ static NodeImpl* parse_ini(char* text, size_t size, int flags) {
394407
sort(ctx.begin(), ctx.end());
395408
vector<IniNode> sections, nodes;
396409
estring_view prev_sect;
397-
auto root = new IniNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
410+
auto root = make_unique<IniNode>(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
398411
for (auto& x : ctx) {
399412
if (prev_sect != x.section) {
400413
prev_sect = x.section;
401414
if (!nodes.empty() && !sections.empty()) {
402415
sections.back().set_children(std::move(nodes));
403416
assert(nodes.empty());
404417
}
405-
sections.emplace_back(x.section, str{}, root);
418+
sections.emplace_back(x.section, str{}, root.get());
406419
}
407-
nodes.emplace_back(x.key, x.val, root);
420+
nodes.emplace_back(x.key, x.val, root.get());
408421
}
409422
if (!sections.empty()) {
410423
if (!nodes.empty())
411424
sections.back().set_children(std::move(nodes));
412425
root->set_children(std::move(sections));
413426
}
414-
return root;
427+
return root.release();
415428
}
416429

417430
Node parse(char* text, size_t size, int flags) {
@@ -425,7 +438,9 @@ Node parse(char* text, size_t size, int flags) {
425438
if (flags & DOC_FREE_TEXT_IF_PARSING_FAILED) free(text);
426439
LOG_ERROR_RETURN(EINVAL, nullptr, "invalid document type ", HEX(i));
427440
}
428-
auto r = parsers[i](text, size, flags);
441+
NodeImpl* r = nullptr;
442+
try { r = parsers[i](text, size, flags); }
443+
catch(...) { LOG_ERROR("parsing failed and exception caught"); }
429444
if (!r && (flags & DOC_FREE_TEXT_IF_PARSING_FAILED)) free(text);
430445
return r;
431446
}

ecosystem/simple_dom.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,11 @@ class Node {
9999
double to_double(double def_val = NAN) const {
100100
return value().to_double(def_val);
101101
}
102+
bool to_bool() const {
103+
assert(type() == TYPE::BOOLEAN);
104+
auto v = value();
105+
return v.size() && (v[0] == 't' || v[0] == 'T');
106+
}
102107
using TYPE = NodeImpl::TYPE;
103108

104109
bool operator==(str rhs) const { return value() == rhs; }

ecosystem/test/test_simple_dom.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ using namespace std;
3030
using namespace photon::SimpleDOM;
3131

3232
// OSS list response
33-
const static char xml[] = R"(
33+
static char xml[] = R"(
3434
<?xml version="1.0" encoding="UTF-8"?>
3535
<ListBucketResult category = "flowers">
3636
<Name>examplebucket</Name>
@@ -89,7 +89,7 @@ void print_all2(Node node) {
8989

9090
static __attribute__((noinline))
9191
int do_list_object(string_view prefix, ObjectList& result, string* marker) {
92-
auto doc = parse_copy(xml, sizeof(xml), DOC_XML);
92+
auto doc = parse(xml, sizeof(xml)-1, DOC_XML);
9393
EXPECT_TRUE(doc);
9494
auto list_bucket_result = doc["ListBucketResult"];
9595
auto attr = list_bucket_result.get_attributes();
@@ -190,7 +190,7 @@ void expect_types(Node node, const std::pair<const char*, uint8_t> (&truth)[N])
190190
}
191191

192192
TEST(simple_dom, json) {
193-
const static char json0[] = R"({
193+
static char json0[] = R"({
194194
"hello": "world",
195195
"t": true ,
196196
"f": false,
@@ -199,7 +199,7 @@ TEST(simple_dom, json) {
199199
"pi": 3.1416,
200200
"a": [1, 2, 3, 4],
201201
})";
202-
auto doc = parse_copy(json0, sizeof(json0), DOC_JSON);
202+
auto doc = parse(json0, sizeof(json0)-1, DOC_JSON);
203203
EXPECT_TRUE(doc);
204204
expect_eq_kvs(doc, {
205205
{"hello", "world"},
@@ -220,7 +220,7 @@ TEST(simple_dom, json) {
220220

221221
TEST(simple_dom, yaml0) {
222222
static char yaml0[] = "{foo: 1, bar: [2, 3], john: doe}";
223-
auto doc = parse(yaml0, sizeof(yaml0), DOC_YAML);
223+
auto doc = parse(yaml0, sizeof(yaml0)-1, DOC_YAML);
224224
EXPECT_TRUE(doc);
225225
expect_eq_kvs(doc, {{"foo", "1"}, {"john", "doe"}});
226226
expect_eq_vals(doc["bar"], {"2", "3"});
@@ -245,7 +245,7 @@ newmap: {}
245245
newmap (serialized): {}
246246
I am something: indeed
247247
)";
248-
auto doc = parse(yaml1, sizeof(yaml1), DOC_YAML);
248+
auto doc = parse(yaml1, sizeof(yaml1)-1, DOC_YAML);
249249
EXPECT_TRUE(doc);
250250
expect_eq_kvs(doc, {
251251
{"foo", "says who"},
@@ -259,7 +259,7 @@ I am something: indeed
259259
"oh so nice", "oh so nice (serialized)"});
260260
}
261261

262-
const static char example_ini[] = R"(
262+
static char example_ini[] = R"(
263263
[protocol] ; Protocol configuration
264264
version=6 ; IPv6
265265
@@ -306,7 +306,7 @@ funny4 : two : colons
306306
)";
307307

308308
TEST(simple_dom, ini) {
309-
auto doc = parse_copy(example_ini, sizeof(example_ini) - 1, DOC_INI);
309+
auto doc = parse(example_ini, sizeof(example_ini)-1, DOC_INI);
310310
EXPECT_TRUE(doc);
311311
EXPECT_EQ(doc.num_children(), 6);
312312
expect_eq_kvs(doc["protocol"], {

0 commit comments

Comments
 (0)