Skip to content

Commit 7679fe4

Browse files
committed
New libxml v2.14+ balanced chunk parsing implementation
1 parent 886ba6c commit 7679fe4

File tree

3 files changed

+42
-23
lines changed

3 files changed

+42
-23
lines changed

lib/LibXML/DocumentFragment.rakumod

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
unit class LibXML::DocumentFragment;
33

44
use LibXML::Node;
5+
use LibXML::Enums;
56
use LibXML::Raw;
67
use LibXML::_ParentNode;
78
use LibXML::_Rawish;
@@ -66,13 +67,13 @@ class ParserContext is LibXML::Parser::Context {
6667
has Int $.stat is rw;
6768
has Str $.string;
6869
has Pointer $.user-data;
69-
has Pointer[xmlNode] $.nodes is rw .= new();
70+
has xmlNode $.nodes is rw;
7071
my Lock:D $lock .= new;
7172

7273
submethod DESTROY {
7374
$lock.protect: {
7475
if $!nodes {
75-
$!nodes.deref.FreeList(); ;
76+
$!nodes.FreeList();
7677
}
7778
}
7879
}
@@ -81,8 +82,8 @@ class ParserContext is LibXML::Parser::Context {
8182
my xmlNode $rv;
8283
$lock.protect: {
8384
if $!nodes {
84-
$rv = $!nodes.deref;
85-
$!nodes .= new();
85+
$rv = $!nodes;
86+
$!nodes = Nil;
8687
}
8788
}
8889
$rv;
@@ -123,7 +124,7 @@ proto method parse(
123124

124125
multi method parse(
125126
::?CLASS:U:
126-
Str:D() :$string,
127+
Str:D() :$string,
127128
Bool :balanced($)! where .so,
128129
Pointer :$user-data,
129130
|c) is hidden-from-backtrace {
@@ -143,16 +144,31 @@ multi method parse(
143144
$ctx.do: {
144145
# Simple closures tend to leak on native callbacks, so use dynamic variables instead.
145146
my $ctx := $*XML-CONTEXT;
146-
my xmlSAXHandler $sax = .raw with $ctx.sax-handler;
147-
my $doc = $ctx.doc-frag.raw.doc;
148-
my Pointer $user-data = $ctx.user-data;
149-
temp LibXML::Raw.KeepBlanksDefault = $ctx.keep-blanks;
150147

151-
$ctx.stat = ($doc // xmlDoc).xmlParseBalancedChunkMemoryRecover(
152-
($sax // xmlSAXHandler), ($ctx.user-data // Pointer), 0, $ctx.string, $ctx.nodes, +$ctx.recover
153-
);
154-
};
148+
if $.config.version >= v2.14.0 {
149+
my xmlDoc:D $doc = self.raw.doc // xmlDoc.new;
150+
my $raw = ($doc.type == XML_HTML_DOCUMENT_NODE
151+
?? htmlParserCtxt.new
152+
!! xmlParserCtxt.new);
153+
my xmlParserInput $input .= new: :$string;
154+
$raw.myDoc = $doc;
155+
$ctx.set-raw: $raw;
156+
$ctx.nodes = $raw.ParseContent($input, $doc, 0);
157+
}
158+
else {
159+
my xmlSAXHandler $sax = .raw with $ctx.sax-handler;
160+
my $doc = $ctx.doc-frag.raw.doc;
161+
my Pointer $user-data = $ctx.user-data;
162+
temp LibXML::Raw.KeepBlanksDefault = $ctx.keep-blanks;
163+
my Pointer[xmlNode] $nodes-p .= new;
164+
$ctx.stat = ($doc // xmlDoc).xmlParseBalancedChunkMemoryRecover(
165+
($sax // xmlSAXHandler), ($ctx.user-data // Pointer), 0, $ctx.string, $nodes-p, +$ctx.recover
166+
);
167+
$ctx.nodes = $nodes-p ?? $nodes-p.deref !! Nil;
168+
}
155169

170+
LEAVE .close() with $ctx;
171+
}
156172
# just in case we didn't catch the error
157173
die "balanced parse failed with status {$ctx.stat}"
158174
if $ctx.stat && !$ctx.recover;

lib/LibXML/Raw.rakumod

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,12 @@ class xmlParserInput is export {
382382
has int32 $.standalone; # Was that entity marked standalone
383383
has int32 $.id; # a unique identifier for the entity
384384

385+
# available since libxml2 v2.14.0
386+
our sub NewFromString(Str $url, Str $str, int32 $flags --> ::?CLASS) is native($XML2) is symbol('xmlNewInputFromString') {*}
385387
method Free is native($XML2) is symbol('xmlFreeInputStream') {*}
388+
method new(Str:D :$string!, Str :$url, UInt:D :$flags = 0) {
389+
NewFromString($url, $string, $flags);
390+
}
386391
}
387392

388393
#| An XML Element content as stored after parsing an element definition
@@ -1632,6 +1637,8 @@ class xmlParserCtxt is export {
16321637
method UseOptions(int32 --> int32) is native($XML2) is symbol('xmlCtxtUseOptions') { * }
16331638
method NewInputStream(xmlParserInputBuffer, int32 $enc --> xmlParserInput) is native($XML2) is symbol('xmlNewIOInputStream') is export {*}
16341639
method NewInputFile(Str --> xmlParserInput) is native($XML2) is export is symbol('xmlNewInputFromFile') {*}
1640+
# available since libxml2 v2.14.0
1641+
method ParseContent(xmlParserInput, xmlNode, int32 $has-text-decl --> xmlNode) is native($XML2) is export is symbol('xmlCtxtParseContent') {*}
16351642
# deprecated
16361643
method SetStructuredErrorFunc( &error-func (xmlParserCtxt $, xmlError $)) is native($XML2) is symbol('xmlSetStructuredErrorFunc') {*};
16371644
# recommended libxml 2.13.0+

t/02parse.t

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,7 @@ my %goodPushWF = (
124124
);
125125

126126
my $goodfile = "samples/dromeds.xml";
127-
my $badfile1 = "samples/bad.xml";
128-
my $badfile2 = "does_not_exist.xml";
127+
my $badfile = "samples/bad.xml";
129128

130129
my LibXML $parser .= new();
131130

@@ -201,7 +200,7 @@ subtest 'pedantic-parser', {
201200
subtest 'parse :file', {
202201
lives-ok {my LibXML::Document:D $ = $parser.parse(:file($goodfile));}
203202

204-
throws-like( { $parser.parse(:file($badfile1))},
203+
throws-like( { $parser.parse(:file($badfile))},
205204
X::LibXML::Parser,
206205
"Error thrown with bad xml file");
207206

@@ -231,12 +230,12 @@ else {
231230
isa-ok $io, IO::Handle;
232231
lives-ok {my LibXML::Document:D $ = $parser.parse: :$io;}
233232

234-
$io = $badfile1.IO;
233+
$io = $badfile.IO;
235234
isa-ok $io, IO::Path;
236235
throws-like(
237236
{ $parser.parse: :$io; },
238237
X::LibXML::Parser, :message(rx/:s Extra content at the end of the document/),
239-
"error parsing bad file from file handle of $badfile1"
238+
"error parsing bad file from file handle of $badfile"
240239
);
241240
}
242241
}
@@ -301,7 +300,7 @@ subtest 'x-include processing', {
301300
throws-like { $doc = $parser.parse: :string( $badXInclude ); },
302301
X::LibXML::Parser,
303302
:message(rx/'samples/bad.xml:3: parser error : Extra content at the end of the document'/),
304-
"error parsing $badfile1 in include";
303+
"error parsing $badfile in include";
305304
ok !$doc.defined, "no doc returned";
306305

307306
# some bad stuff
@@ -536,7 +535,6 @@ subtest 'parse well balanced chunks', {
536535
"<ouch>",
537536
"<ouch>bar",
538537
"bar</ouch>",
539-
"<ouch/>&foo;", # undefined entity
540538
"&", # bad char
541539
## "h\x[e4]h?", # bad encoding
542540
"<!--->", # bad stays bad ;)
@@ -564,7 +562,7 @@ subtest 'parse well balanced chunks', {
564562
next;
565563
}
566564
}
567-
fail("Unexpected fragment without child nodes");
565+
flunk("Unexpected fragment without child nodes");
568566
}
569567
}
570568

@@ -916,5 +914,3 @@ subtest 'Perl fossil', {
916914
};
917915

918916
}
919-
920-

0 commit comments

Comments
 (0)