1 // Written in the D programming language 2 3 /++ 4 This implements a range-based 5 $(LINK2 https://en.wikipedia.org/wiki/StAX, StAX _parser) for XML 1.0 (which 6 will work with XML 1.1 documents assuming that they don't use any 7 1.1-specific features). For the sake of simplicity, sanity, and efficiency, 8 the $(LINK2 https://en.wikipedia.org/wiki/Document_type_definition, DTD) 9 section is not supported beyond what is required to parse past it. 10 11 Start tags, end tags, comments, cdata sections, and processing instructions 12 are all supported and reported to the application. Anything in the DTD is 13 skipped (though it's parsed enough to parse past it correctly, and that 14 $(I can) result in an $(LREF XMLParsingException) if that XML isn't valid 15 enough to be correctly skipped), and the 16 $(LINK2 http://www.w3.org/TR/REC-xml/#NT-XMLDecl, XML declaration) at the 17 top is skipped if present (XML 1.1 requires that it be there, but XML 1.0 18 does not). 19 20 Regardless of what the XML declaration says (if present), any range of 21 $(K_CHAR) will be treated as being encoded in UTF-8, any range of 22 $(K_WCHAR) will be treated as being encoded in UTF-16, and any range of 23 $(K_DCHAR) will be treated as having been encoded in UTF-32. Strings will 24 be treated as ranges of their code units, not code points. Note that like 25 Phobos typically does when processing strings, the code assumes that BOMs 26 have already been removed, so if the range of characters comes from a file 27 that uses a BOM, the calling code needs to strip it out before calling 28 $(LREF parseXML), or parsing will fail due to invalid characters. 29 30 Since the DTD is skipped, entity references other than the five which are 31 predefined by the XML spec cannot be fully processed (since wherever they 32 were used in the document would be replaced by what they referred to, which 33 could be arbitrarily complex XML). 
As such, by default, if any entity
    references which are not predefined are encountered outside of the DTD, an
    $(LREF XMLParsingException) will be thrown (see
    $(LREF Config.throwOnEntityRef) for how that can be configured). The
    predefined entity references and any character references encountered will
    be checked to verify that they're valid, but they will not be replaced
    (since that does not work with returning slices of the original input).

    However, $(REF_ALTTEXT decodeXML, decodeXML, dxml, util) or
    $(REF_ALTTEXT parseStdEntityRef, parseStdEntityRef, dxml, util) from
    $(MREF dxml, util) can be used to convert the predefined entity references
    to what they refer to, and $(REF_ALTTEXT decodeXML, decodeXML, dxml, util) or
    $(REF_ALTTEXT parseCharRef, parseCharRef, dxml, util) from
    $(MREF dxml, util) can be used to convert character references to what they
    refer to.

    $(H3 Primary Symbols)
    $(TABLE
        $(TR $(TH Symbol) $(TH Description))
        $(TR $(TD $(LREF parseXML))
             $(TD The function used to initiate the parsing of an XML
                  document.))
        $(TR $(TD $(LREF EntityRange))
             $(TD The range returned by $(LREF parseXML).))
        $(TR $(TD $(LREF EntityRange.Entity))
             $(TD The element type of $(LREF EntityRange).))
    )

    $(H3 Parser Configuration Helpers)
    $(TABLE
        $(TR $(TH Symbol) $(TH Description))
        $(TR $(TD $(LREF Config))
             $(TD Used to configure how $(LREF EntityRange) parses the XML.))
        $(TR $(TD $(LREF simpleXML))
             $(TD A user-friendly configuration for when the application just
                  wants the element tags and the data in between them.))
        $(TR $(TD $(LREF makeConfig))
             $(TD A convenience function for constructing a custom
                  $(LREF Config).))
        $(TR $(TD $(LREF SkipComments))
             $(TD A $(PHOBOS_REF Flag, std, typecons) used with $(LREF Config)
                  to tell the parser to skip comments.))
        $(TR $(TD $(LREF SkipPI))
             $(TD A $(PHOBOS_REF Flag, std, typecons) used with $(LREF Config)
                  to tell the
parser to skip processing instructions.)) 78 $(TR $(TD $(LREF SplitEmpty)) 79 $(TD A $(PHOBOS_REF Flag, std, typecons) used with $(LREF Config) 80 to configure how the parser deals with empty element tags.)) 81 ) 82 83 $(H3 Helper Types Used When Parsing) 84 $(TABLE 85 $(TR $(TH Symbol) $(TH Description)) 86 $(TR $(TD $(LREF EntityType)) 87 $(TD The type of an entity in the XML (e.g. a 88 $(LREF_ALTTEXT start tag, EntityType.elementStart) or a 89 $(LREF_ALTTEXT comment, EntityType.comment)).)) 90 $(TR $(TD $(LREF TextPos)) 91 $(TD Gives the line and column number in the XML document.)) 92 $(TR $(TD $(LREF XMLParsingException)) 93 $(TD Thrown by $(LREF EntityRange) when it encounters invalid 94 XML.)) 95 ) 96 97 $(H3 Helper Functions Used When Parsing) 98 $(TABLE 99 $(TR $(TH Symbol) $(TH Description)) 100 $(TR $(TD $(LREF getAttrs)) 101 $(TD A function similar to $(PHOBOS_REF getopt, std, getopt) which 102 allows for the easy processing of start tag attributes.)) 103 $(TR $(TD $(LREF skipContents)) 104 $(TD Iterates an $(LREF EntityRange) from a start tag to its 105 matching end tag.)) 106 $(TR $(TD $(LREF skipToPath)) 107 $(TD Used to navigate from one start tag to another as if the start 108 tag names formed a file path.)) 109 $(TR $(TD $(LREF skipToEntityType)) 110 $(TD Skips to the next entity of the given type in the range.)) 111 $(TR $(TD $(LREF skipToParentEndTag)) 112 $(TD Iterates an $(LREF EntityRange) until it reaches the end tag 113 that matches the start tag which is the parent of the 114 current entity.)) 115 ) 116 117 $(H3 Helper Traits) 118 $(TABLE 119 $(TR $(TH Symbol) $(TH Description)) 120 $(TR $(TD $(LREF isAttrRange)) 121 $(TD Whether the given range is a range of attributes.))) 122 123 Copyright: Copyright 2017 - 2025 124 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 
Authors:   $(HTTPS jmdavisprog.com, Jonathan M Davis)
    Source:    $(LINK_TO_SRC dxml/_parser.d)

    See_Also: $(LINK2 http://www.w3.org/TR/REC-xml/, Official Specification for XML 1.0)
  +/
module dxml.parser;

///
unittest
{
    auto xml = "<!-- comment -->\n" ~
               "<root>\n" ~
               " <foo>some text<whatever/></foo>\n" ~
               " <bar/>\n" ~
               " <baz></baz>\n" ~
               "</root>";

    // Checks the type and name of the front entity and then pops it off.
    static void expectTag(ER)(ref ER range, EntityType type, string name)
    {
        assert(range.front.type == type);
        assert(range.front.name == name);
        range.popFront();
    }

    // Checks the type and text of the front entity and then pops it off.
    static void expectText(ER)(ref ER range, EntityType type, string text)
    {
        assert(range.front.type == type);
        assert(range.front.text == text);
        range.popFront();
    }

    // With the default configuration, every entity is reported as-is.
    {
        auto range = parseXML(xml);

        expectText(range, EntityType.comment, " comment ");
        expectTag(range, EntityType.elementStart, "root");
        expectTag(range, EntityType.elementStart, "foo");
        expectText(range, EntityType.text, "some text");
        expectTag(range, EntityType.elementEmpty, "whatever");
        expectTag(range, EntityType.elementEnd, "foo");
        expectTag(range, EntityType.elementEmpty, "bar");
        expectTag(range, EntityType.elementStart, "baz");
        expectTag(range, EntityType.elementEnd, "baz");
        expectTag(range, EntityType.elementEnd, "root");

        assert(range.empty);
    }

    // With simpleXML, comments are skipped, so the first entity is <root>.
    {
        auto range = parseXML!simpleXML(xml);

        expectTag(range, EntityType.elementStart, "root");
        expectTag(range, EntityType.elementStart, "foo");
        expectText(range, EntityType.text, "some text");

        // simpleXML splits empty element tags into a start tag and end tag
        // so that the code doesn't have to care whether a start tag with no
        // content is an empty tag or a start tag and end tag with nothing but
        // whitespace in between.
        expectTag(range, EntityType.elementStart, "whatever");
        expectTag(range, EntityType.elementEnd, "whatever");

        expectTag(range, EntityType.elementEnd, "foo");

        expectTag(range, EntityType.elementStart, "bar");
        expectTag(range, EntityType.elementEnd, "bar");

        expectTag(range, EntityType.elementStart, "baz");
        expectTag(range, EntityType.elementEnd, "baz");

        expectTag(range, EntityType.elementEnd, "root");

        assert(range.empty);
    }
}


import std.range.primitives;
import std.traits;
import std.typecons : Flag;


/++
    The type of exception which the XML parser throws when the XML that it is
    parsing is invalid.
  +/
class XMLParsingException : Exception
{
    /++
        Where in the XML input the problem was encountered.
      +/
    TextPos pos;

package:

    /+
        Prefixes msg with the line and column from textPos (in the form
        "[line:col]: ") to form the exception's message and stores textPos
        in pos.
     +/
    this(string msg, TextPos textPos, string file = __FILE__, size_t line = __LINE__) @safe pure
    {
        import std.format : format;

        // Only the parameters are used here, so building the message ahead
        // of the base class constructor call does not touch `this`.
        immutable fullMsg = format!"[%s:%s]: %s"(textPos.line, textPos.col, msg);
        super(fullMsg, file, line);
        pos = textPos;
    }
}


/++
    The location of an entity within the XML document.

    Both the line number and the column number start at 1.

    TextPos exists primarily for $(LREF XMLParsingException), though an
    application is free to use it for its own purposes. To get the TextPos of
    an $(LREF2 Entity, EntityRange), use $(LREF2 Entity.pos, EntityRange).

    See_Also: $(LREF XMLParsingException.pos)$(BR)
              $(LREF EntityRange.Entity.pos)
  +/
struct TextPos
{
    /// The number of a line in the XML file.
    int line = 1;

    /++
        The number of a column within a line of the XML file.

        A column corresponds to a single code unit. So, if a program wants
        some other measure (e.g. the number of graphemes), it will need to
        look at the actual text of the line and compute the number that it
        wants to display.
      +/
    int col = 1;
}


/++
    Used to configure how the parser works.

    See_Also:
        $(LREF makeConfig)$(BR)
        $(LREF parseXML)$(BR)
        $(LREF simpleXML)
  +/
struct Config
{
    /++
        Whether the comments should be skipped while parsing.

        If $(D skipComments == SkipComments.yes), any entities of type
        $(LREF EntityType.comment) will be omitted from the parsing results,
        and they will not be validated beyond what is required to parse past
        them.

        Defaults to $(D SkipComments.no).
      +/
    auto skipComments = SkipComments.no;

    /++
        Whether processing instructions should be skipped.
323 324 If $(D skipPI == SkipPI.yes), any entities of type 325 $(LREF EntityType.pi) will be skipped, and they will not be validated 326 beyond what is required to parse past them. 327 328 Defaults to $(D SkipPI.no). 329 +/ 330 auto skipPI = SkipPI.no; 331 332 /++ 333 Whether the parser should report empty element tags as if they were a 334 start tag followed by an end tag with nothing in between. 335 336 If $(D splitEmpty == SplitEmpty.yes), then whenever an 337 $(LREF EntityType.elementEmpty) is encountered, the parser will claim 338 that that entity is an $(LREF EntityType.elementStart), and then it 339 will provide an $(LREF EntityType.elementEnd) as the next entity before 340 the entity that actually follows it. 341 342 The purpose of this is to simplify the code using the parser, since most 343 code does not care about the difference between an empty tag and a start 344 and end tag with nothing in between. But since some code may care about 345 the difference, the behavior is configurable. 346 347 Defaults to $(D SplitEmpty.no). 348 +/ 349 auto splitEmpty = SplitEmpty.no; 350 351 /// 352 unittest 353 { 354 enum configSplitYes = makeConfig(SplitEmpty.yes); 355 356 { 357 auto range = parseXML("<root></root>"); 358 assert(range.front.type == EntityType.elementStart); 359 assert(range.front.name == "root"); 360 range.popFront(); 361 assert(range.front.type == EntityType.elementEnd); 362 assert(range.front.name == "root"); 363 range.popFront(); 364 assert(range.empty); 365 } 366 { 367 // No difference if the tags are already split. 368 auto range = parseXML!configSplitYes("<root></root>"); 369 assert(range.front.type == EntityType.elementStart); 370 assert(range.front.name == "root"); 371 range.popFront(); 372 assert(range.front.type == EntityType.elementEnd); 373 assert(range.front.name == "root"); 374 range.popFront(); 375 assert(range.empty); 376 } 377 { 378 // This treats <root></root> and <root/> as distinct. 
379 auto range = parseXML("<root/>"); 380 assert(range.front.type == EntityType.elementEmpty); 381 assert(range.front.name == "root"); 382 range.popFront(); 383 assert(range.empty); 384 } 385 { 386 // This is parsed as if it were <root></root> insead of <root/>. 387 auto range = parseXML!configSplitYes("<root/>"); 388 assert(range.front.type == EntityType.elementStart); 389 assert(range.front.name == "root"); 390 range.popFront(); 391 assert(range.front.type == EntityType.elementEnd); 392 assert(range.front.name == "root"); 393 range.popFront(); 394 assert(range.empty); 395 } 396 } 397 398 /++ 399 Whether the parser should throw when it encounters any entity references 400 other than the five entity references defined in the XML standard. 401 402 Any other entity references would have to be defined in the DTD in 403 order to be valid. And in order to know what XML they represent (which 404 could be arbitrarily complex, even effectively inserting entire XML 405 documents into the middle of the XML), the DTD would have to be parsed. 406 However, dxml does not support parsing the DTD beyond what is required 407 to correctly parse past it, and replacing entity references with what 408 they represent would not work with the slicing semantics that 409 $(LREF EntityRange) provides. As such, it is not possible for dxml to 410 correctly handle any entity references other than the five which are 411 defined in the XML standard, and even those are only parsed by using 412 $(REF decodeXML, dxml, util) or $(REF parseStdEntityRef, dxml, util). 413 $(LREF EntityRange) always validates that entity references are one 414 of the five, predefined entity references, but otherwise, it lets them 415 pass through as normal text. It does not replace them with what they 416 represent. 417 418 As such, the default behavior of $(LREF EntityRange) is to throw an 419 $(LREF XMLParsingException) when it encounters an entity reference 420 which is not one of the five defined by the XML standard. 
With that 421 behavior, there is no risk of processing an XML document as if it had 422 no entity references and ending up with what the program using the 423 parser would probably consider incorrect results. However, there are 424 cases where a program may find it acceptable to treat entity references 425 as normal text and ignore them. As such, if a program wishes to take 426 that approach, it can set throwOnEntityRef to $(D ThrowOnEntityRef.no). 427 428 If $(D throwOnEntityRef == ThrowOnEntityRef.no), then any entity 429 reference that it encounters will be validated to ensure that it is 430 syntactically valid (i.e. that the characters it contains form what 431 could be a valid entity reference assuming that the DTD declared it 432 properly), but otherwise, $(LREF EntityRange) will treat it as normal 433 text, just like it treats the five, predefined entity references as 434 normal text. 435 436 Note that any valid XML entity reference which contains start or end 437 tags must contain matching start or end tags, and entity references 438 cannot contain incomplete fragments of XML (e.g. the start or end of a 439 comment). So, missing entity references should only affect the data in 440 the XML document and not its overall structure (if that were not _true, 441 attempting to ignore entity references such as $(D ThrowOnEntityRef.no) 442 does would be a disaster in the making). However, how reasonable it is 443 to miss that data depends entirely on the application and what the XML 444 documents it's parsing contain - hence, the behavior is configurable. 
445 446 See_Also: $(REF StdEntityRef, dxml, util)$(BR) 447 $(REF parseStdEntityRef, dxml, util)$(BR) 448 $(REF parseCharRef, dxml, util)$(BR) 449 $(REF encodeCharRef, dxml, util)$(BR) 450 $(REF decodeXML, dxml, util)$(BR) 451 $(REF asDecodedXML, dxml, util) 452 +/ 453 auto throwOnEntityRef = ThrowOnEntityRef.yes; 454 455 /// 456 unittest 457 { 458 import std.exception : assertThrown; 459 import dxml.util : decodeXML; 460 461 auto xml = "<root>\n" ~ 462 " <std>&'><"</std>\n" ~ 463 " <other>&foobar;</other>\n" ~ 464 " <invalid>&--;</invalid>\n" ~ 465 "</root>"; 466 467 // ThrowOnEntityRef.yes 468 { 469 auto range = parseXML(xml); 470 assert(range.front.type == EntityType.elementStart); 471 assert(range.front.name == "root"); 472 473 range.popFront(); 474 assert(range.front.type == EntityType.elementStart); 475 assert(range.front.name == "std"); 476 477 range.popFront(); 478 assert(range.front.type == EntityType.text); 479 assert(range.front.text == "&'><""); 480 assert(range.front.text.decodeXML() == `&'><"`); 481 482 range.popFront(); 483 assert(range.front.type == EntityType.elementEnd); 484 assert(range.front.name == "std"); 485 486 range.popFront(); 487 assert(range.front.type == EntityType.elementStart); 488 assert(range.front.name == "other"); 489 490 // Attempted to parse past "&foobar;", which is syntactically 491 // valid, but it's not one of the five predefined entity references. 
492 assertThrown!XMLParsingException(range.popFront()); 493 } 494 495 // ThrowOnEntityRef.no 496 { 497 auto range = parseXML!(makeConfig(ThrowOnEntityRef.no))(xml); 498 assert(range.front.type == EntityType.elementStart); 499 assert(range.front.name == "root"); 500 501 range.popFront(); 502 assert(range.front.type == EntityType.elementStart); 503 assert(range.front.name == "std"); 504 505 range.popFront(); 506 assert(range.front.type == EntityType.text); 507 assert(range.front.text == "&'><""); 508 assert(range.front.text.decodeXML() == `&'><"`); 509 510 range.popFront(); 511 assert(range.front.type == EntityType.elementEnd); 512 assert(range.front.name == "std"); 513 514 range.popFront(); 515 assert(range.front.type == EntityType.elementStart); 516 assert(range.front.name == "other"); 517 518 // Doesn't throw, because "&foobar;" is syntactically valid. 519 range.popFront(); 520 assert(range.front.type == EntityType.text); 521 assert(range.front.text == "&foobar;"); 522 523 // decodeXML has no effect on non-standard entity references. 524 assert(range.front.text.decodeXML() == "&foobar;"); 525 526 range.popFront(); 527 assert(range.front.type == EntityType.elementEnd); 528 assert(range.front.name == "other"); 529 530 range.popFront(); 531 assert(range.front.type == EntityType.elementStart); 532 assert(range.front.name == "invalid"); 533 534 // Attempted to parse past "&--;", which is not syntactically valid, 535 // because -- is not a valid name for an entity reference. 
536 assertThrown!XMLParsingException(range.popFront()); 537 } 538 } 539 } 540 541 542 /// See_Also: $(LREF2 skipComments, Config) 543 alias SkipComments = Flag!"SkipComments"; 544 545 /// See_Also: $(LREF2 skipPI, Config) 546 alias SkipPI = Flag!"SkipPI"; 547 548 /// See_Also: $(LREF2 splitEmpty, Config) 549 alias SplitEmpty = Flag!"SplitEmpty"; 550 551 /// See_Also: $(LREF2 throwOnEntityRef, Config) 552 alias ThrowOnEntityRef = Flag!"ThrowOnEntityRef"; 553 554 555 /++ 556 Helper function for creating a custom config. It makes it easy to set one 557 or more of the member variables to something other than the default without 558 having to worry about explicitly setting them individually or setting them 559 all at once via a constructor. 560 561 The order of the arguments does not matter. The types of each of the members 562 of Config are unique, so that information alone is sufficient to determine 563 which argument should be assigned to which member. 564 +/ 565 Config makeConfig(Args...)(Args args) 566 { 567 import std.format : format; 568 import std.meta : AliasSeq, staticIndexOf, staticMap; 569 570 template isValid(T, Types...) 571 { 572 static if(Types.length == 0) 573 enum isValid = false; 574 else static if(is(T == Types[0])) 575 enum isValid = true; 576 else 577 enum isValid = isValid!(T, Types[1 .. 
$]); 578 } 579 580 Config config; 581 582 alias TypeOfMember(string memberName) = typeof(__traits(getMember, config, memberName)); 583 alias MemberTypes = staticMap!(TypeOfMember, AliasSeq!(__traits(allMembers, Config))); 584 585 foreach(i, arg; args) 586 { 587 static assert(isValid!(typeof(arg), MemberTypes), 588 format!"Argument %s does not match the type of any members of Config"(i)); 589 590 static foreach(j, Other; Args) 591 { 592 static if(i != j) 593 static assert(!is(typeof(arg) == Other), format!"Argument %s and %s have the same type"(i, j)); 594 } 595 596 foreach(memberName; __traits(allMembers, Config)) 597 { 598 static if(is(typeof(__traits(getMember, config, memberName)) == typeof(arg))) 599 mixin("config." ~ memberName ~ " = arg;"); 600 } 601 } 602 603 return config; 604 } 605 606 /// 607 @safe pure nothrow @nogc unittest 608 { 609 { 610 auto config = makeConfig(SkipComments.yes); 611 assert(config.skipComments == SkipComments.yes); 612 assert(config.skipPI == Config.init.skipPI); 613 assert(config.splitEmpty == Config.init.splitEmpty); 614 assert(config.throwOnEntityRef == Config.init.throwOnEntityRef); 615 } 616 { 617 auto config = makeConfig(SkipComments.yes, SkipPI.yes); 618 assert(config.skipComments == SkipComments.yes); 619 assert(config.skipPI == SkipPI.yes); 620 assert(config.splitEmpty == Config.init.splitEmpty); 621 assert(config.throwOnEntityRef == Config.init.throwOnEntityRef); 622 } 623 { 624 auto config = makeConfig(SplitEmpty.yes, SkipComments.yes, ThrowOnEntityRef.no); 625 assert(config.skipComments == SkipComments.yes); 626 assert(config.skipPI == Config.init.skipPI); 627 assert(config.splitEmpty == SplitEmpty.yes); 628 assert(config.throwOnEntityRef == ThrowOnEntityRef.no); 629 } 630 } 631 632 unittest 633 { 634 import std.typecons : Flag; 635 static assert(!__traits(compiles, makeConfig(42))); 636 static assert(!__traits(compiles, makeConfig("hello"))); 637 static assert(!__traits(compiles, makeConfig(Flag!"SomeOtherFlag".yes))); 
static assert(!__traits(compiles, makeConfig(SplitEmpty.yes, SplitEmpty.no)));
}


/++
    A ready-made $(LREF Config) for applications which only care about the
    actual data in the XML. Comments and processing instructions are skipped,
    and empty element tags are reported as a start tag followed by an end tag
    so that they don't need to be handled any differently from a start tag
    and end tag with nothing but whitespace between them.
  +/
enum Config simpleXML = makeConfig(SplitEmpty.yes, SkipPI.yes, SkipComments.yes);

///
@safe pure nothrow @nogc unittest
{
    // The three members which simpleXML sets explicitly.
    static assert(simpleXML.splitEmpty == SplitEmpty.yes);
    static assert(simpleXML.skipPI == SkipPI.yes);
    static assert(simpleXML.skipComments == SkipComments.yes);

    // throwOnEntityRef is left at its default.
    static assert(simpleXML.throwOnEntityRef == ThrowOnEntityRef.yes);
}


/++
    The kinds of entities which can be found in an XML document. Used by
    $(LREF EntityRange.Entity).
  +/
enum EntityType
{
    /++
        A CDATA section, i.e. `<![CDATA[ ... ]]>`.

        See_Also: $(LINK http://www.w3.org/TR/REC-xml/#sec-cdata-sect)
      +/
    cdata,

    /++
        A comment, i.e. `<!-- ... -->`.

        See_Also: $(LINK http://www.w3.org/TR/REC-xml/#sec-comments)
      +/
    comment,

    /++
        An element's start tag. e.g. `<foo name="value">`.

        See_Also: $(LINK http://www.w3.org/TR/REC-xml/#sec-starttags)
      +/
    elementStart,

    /++
        An element's end tag. e.g. `</foo>`.

        See_Also: $(LINK http://www.w3.org/TR/REC-xml/#sec-starttags)
      +/
    elementEnd,

    /++
        An empty element tag - i.e. the tag for an element which has neither
        contents nor a matching end tag. e.g. `<foo name="value"/>`.

        See_Also: $(LINK http://www.w3.org/TR/REC-xml/#sec-starttags)
      +/
    elementEmpty,

    /++
        A processing instruction. e.g. `<?foo?>`. The `<?xml ... ?>`
        declaration is skipped rather than being reported as an
        $(LREF EntityType._pi).

        See_Also: $(LINK http://www.w3.org/TR/REC-xml/#sec-pi)
      +/
    pi,

    /++
        Plain text which is the content of an element tag.

        If the text is followed by an entity other than the end tag, then the
        text runs up to that entity.

        Character references (e.g. $(D_CODE_STRING "$(AMP)#42")) and the
        predefined entity references (e.g. $(D_CODE_STRING "$(AMP)apos;"))
        are $(I not) processed in the text. To have them processed, pass the
        text to either $(REF_ALTTEXT decodeXML, decodeXML, dxml, util) or
        $(REF_ALTTEXT asDecodedXML, asDecodedXML, dxml, util). Because the
        DTD section is skipped, entity references which are not predefined
        cannot be processed properly, so by default, they are treated as
        invalid XML (see $(LREF Config.throwOnEntityRef)).

        See_Also: $(LINK http://www.w3.org/TR/REC-xml/#sec-starttags)$(BR)
                  $(REF decodeXML, dxml, util)$(BR)
                  $(REF asDecodedXML, dxml, util)$(BR)
                  $(REF parseStdEntityRef, dxml, util)$(BR)
                  $(REF parseCharRef, dxml, util)$(BR)
                  $(LREF EntityRange.Entity._text)
      +/
    text,
}


/++
    Lazily parses the given range of characters as an XML document.

    EntityRange is essentially a
    $(LINK2 https://en.wikipedia.org/wiki/StAX, StAX) parser, though it evolved
    into that rather than being based on what Java did, and it's range-based
    rather than iterator-based, so its API is likely to differ from other
    implementations. The basic concept should be the same though.

    One of the core design goals of this parser is to slice the original input
    rather than having to allocate strings for the output or wrap it in a lazy
    range that produces a mutated version of the data. So, all of the text that
    the parser provides is either a slice or
    $(PHOBOS_REF takeExactly, std, range) of the input.
However, in some cases, 749 for the parser to be fully compliant with the XML spec, 750 $(REF decodeXML, dxml, util) must be called on the text to mutate certain 751 constructs (e.g. removing any $(D_CODE_STRING '\r') in the text or 752 converting $(D_CODE_STRING "$(AMP)lt;") to $(D_CODE_STRING '<')). But 753 that's left up to the application. 754 755 The parser is not $(K_NOGC), but it allocates memory very minimally. It 756 allocates some of its state on the heap so it can validate attributes and 757 end tags. However, that state is shared among all the ranges that came from 758 the same call to parseXML (only the range farthest along in parsing 759 validates attributes or end tags), so $(LREF2 save, _EntityRange) does not 760 allocate memory unless $(D save) on the underlying range allocates memory. 761 The shared state currently uses a couple of dynamic arrays to validate the 762 tags and attributes, and if the document has a particularly deep tag depth 763 or has a lot of attributes on a start tag, then some reallocations may 764 occur until the maximum is reached, but enough is reserved that for most 765 documents, no reallocations will occur. The only other times that the 766 parser would allocate would be if an exception were thrown or if the range 767 that was passed to parseXML allocates for any reason when calling any of the 768 range primitives. 769 770 If invalid XML is encountered at any point during the parsing process, an 771 $(LREF XMLParsingException) will be thrown. If an exception has been thrown, 772 then the parser is in an invalid state, and it is an error to call any 773 functions on it. 774 775 However, note that XML validation is reduced for any entities that are 776 skipped (e.g. for anything in the DTD, validation is reduced to what is 777 required to correctly parse past it, and when 778 $(D Config.skipPI == SkipPI.yes), processing instructions are only validated 779 enough to correctly skip past them). 
780 781 As the module documentation says, this parser does not provide any DTD 782 support. It is not possible to properly support the DTD while returning 783 slices of the original input, and the DTD portion of the spec makes parsing 784 XML far, far more complicated. 785 786 A quick note about carriage returns$(COLON) per the XML spec, they are all 787 supposed to either be stripped out or replaced with newlines or spaces 788 before the XML parser even processes the text. That doesn't work when the 789 parser is slicing the original text and not mutating it at all. So, for the 790 purposes of parsing, this parser treats all carriage returns as if they 791 were newlines or spaces (though they won't count as newlines when counting 792 the lines for $(LREF TextPos)). However, they $(I will) appear in any text 793 fields or attribute values if they are in the document (since the text 794 fields and attribute values are slices of the original text). 795 $(REF decodeXML, dxml, util) can be used to strip them along with 796 converting any character references in the text. Alternatively, the 797 application can remove them all before calling parseXML, but it's not 798 necessary. 799 +/ 800 struct EntityRange(Config cfg, R) 801 if(isForwardRange!R && isSomeChar!(ElementType!R)) 802 { 803 import std.algorithm : canFind; 804 import std.range : only, takeExactly; 805 import std.typecons : Nullable; 806 import std.utf : byCodeUnit; 807 808 enum compileInTests = is(R == EntityRangeCompileTests); 809 810 public: 811 812 /// The Config used for when parsing the XML. 813 alias config = cfg; 814 815 /// The type of the range that EntityRange is parsing. 816 alias Input = R; 817 818 /++ 819 The type used when any slice of the original input is used. If $(D R) 820 is a string or supports slicing, then SliceOfR is the same as $(D R); 821 otherwise, it's the result of calling 822 $(PHOBOS_REF takeExactly, std, range) on the input. 
823 824 --- 825 import std.algorithm : filter; 826 import std.range : takeExactly; 827 828 static assert(is(EntityRange!(Config.init, string).SliceOfR == string)); 829 830 auto range = filter!(a => true)("some xml"); 831 832 static assert(is(EntityRange!(Config.init, typeof(range)).SliceOfR == 833 typeof(takeExactly(range, 42)))); 834 --- 835 +/ 836 static if(isDynamicArray!R || hasSlicing!R) 837 alias SliceOfR = R; 838 else 839 alias SliceOfR = typeof(takeExactly(R.init, 42)); 840 841 // https://issues.dlang.org/show_bug.cgi?id=11133 prevents this from being 842 // a ddoc-ed unit test. 843 static if(compileInTests) @safe unittest 844 { 845 import std.algorithm : filter; 846 import std.range : takeExactly; 847 848 static assert(is(EntityRange!(Config.init, string).SliceOfR == string)); 849 850 auto range = filter!(a => true)("some xml"); 851 852 static assert(is(EntityRange!(Config.init, typeof(range)).SliceOfR == 853 typeof(takeExactly(range, 42)))); 854 } 855 856 857 /++ 858 Represents an entity in the XML document. 859 860 Note that the $(LREF2 type, EntityRange._Entity) determines which 861 properties can be used, and it can determine whether functions which 862 an Entity or $(LREF EntityRange) is passed to are allowed to be called. 863 Each function lists which $(LREF EntityType)s are allowed, and it is an 864 error to call them with any other $(LREF EntityType). 865 +/ 866 struct Entity 867 { 868 public: 869 870 import std.typecons : Tuple; 871 872 /++ 873 The exact instantiation of $(PHOBOS_REF Tuple, std, typecons) that 874 $(LREF2 attributes, EntityRange.EntityType) returns a range of. 875 876 See_Also: $(LREF2 attributes, EntityRange.Entity) 877 +/ 878 alias Attribute = Tuple!(SliceOfR, "name", SliceOfR, "value", TextPos, "pos"); 879 880 881 /++ 882 The $(LREF EntityType) for this Entity. 
883 +/ 884 @property EntityType type() @safe const pure nothrow @nogc 885 { 886 return _type; 887 } 888 889 /// 890 static if(compileInTests) unittest 891 { 892 auto xml = "<root>\n" ~ 893 " <!--no comment-->\n" ~ 894 " <![CDATA[cdata run]]>\n" ~ 895 " <text>I am text!</text>\n" ~ 896 " <empty/>\n" ~ 897 " <?pi?>\n" ~ 898 "</root>"; 899 900 auto range = parseXML(xml); 901 assert(range.front.type == EntityType.elementStart); 902 assert(range.front.name == "root"); 903 range.popFront(); 904 905 assert(range.front.type == EntityType.comment); 906 assert(range.front.text == "no comment"); 907 range.popFront(); 908 909 assert(range.front.type == EntityType.cdata); 910 assert(range.front.text == "cdata run"); 911 range.popFront(); 912 913 assert(range.front.type == EntityType.elementStart); 914 assert(range.front.name == "text"); 915 range.popFront(); 916 917 assert(range.front.type == EntityType.text); 918 assert(range.front.text == "I am text!"); 919 range.popFront(); 920 921 assert(range.front.type == EntityType.elementEnd); 922 assert(range.front.name == "text"); 923 range.popFront(); 924 925 assert(range.front.type == EntityType.elementEmpty); 926 assert(range.front.name == "empty"); 927 range.popFront(); 928 929 assert(range.front.type == EntityType.pi); 930 assert(range.front.name == "pi"); 931 range.popFront(); 932 933 assert(range.front.type == EntityType.elementEnd); 934 assert(range.front.name == "root"); 935 range.popFront(); 936 937 assert(range.empty); 938 } 939 940 941 /++ 942 The position in the the original text where the entity starts. 943 944 See_Also: $(LREF TextPos)$(BR) 945 $(LREF XMLParsingException._pos) 946 +/ 947 @property TextPos pos() @safe const pure nothrow @nogc 948 { 949 return _pos; 950 } 951 952 /// 953 static if(compileInTests) unittest 954 { 955 auto xml = "<root>\n" ~ 956 " <foo>\n" ~ 957 " Foo and bar. 
Always foo and bar...\n" ~ 958 " </foo>\n" ~ 959 "</root>"; 960 961 auto range = parseXML(xml); 962 assert(range.front.type == EntityType.elementStart); 963 assert(range.front.name == "root"); 964 assert(range.front.pos == TextPos(1, 1)); 965 range.popFront(); 966 967 assert(range.front.type == EntityType.elementStart); 968 assert(range.front.name == "foo"); 969 assert(range.front.pos == TextPos(2, 5)); 970 range.popFront(); 971 972 assert(range.front.type == EntityType.text); 973 assert(range.front.text == 974 "\n" ~ 975 " Foo and bar. Always foo and bar...\n" ~ 976 " "); 977 assert(range.front.pos == TextPos(2, 10)); 978 range.popFront(); 979 980 assert(range.front.type == EntityType.elementEnd); 981 assert(range.front.name == "foo"); 982 assert(range.front.pos == TextPos(4, 5)); 983 range.popFront(); 984 985 assert(range.front.type == EntityType.elementEnd); 986 assert(range.front.name == "root"); 987 assert(range.front.pos == TextPos(5, 1)); 988 range.popFront(); 989 990 assert(range.empty); 991 } 992 993 static if(compileInTests) unittest 994 { 995 import core.exception : AssertError; 996 import std.exception : enforce; 997 998 static void test(ER)(ref ER range, EntityType type, int row, int col, size_t line = __LINE__) 999 { 1000 enforce!AssertError(!range.empty, "unittest failure 1", __FILE__, line); 1001 enforce!AssertError(range.front.type == type, "unittest failure 2", __FILE__, line); 1002 enforce!AssertError(range.front.pos == TextPos(row, col), "unittest failure 3", __FILE__, line); 1003 range.popFront(); 1004 } 1005 1006 auto xml = "<?xml?>\n" ~ 1007 " <!--comment-->\n" ~ 1008 " <?pi?>\n" ~ 1009 " <root>\n" ~ 1010 " <!--comment--><!--comment-->\n" ~ 1011 " <?pi?>\n" ~ 1012 " <![CDATA[]]>\n" ~ 1013 " <empty/> </root>\n" ~ 1014 " <!--comment-->\n" ~ 1015 " <?pi?>\n"; 1016 1017 { 1018 auto range = parseXML(xml); 1019 test(range, EntityType.comment, 2, 4); 1020 test(range, EntityType.pi, 3, 4); 1021 test(range, EntityType.elementStart, 4, 2); 1022 
test(range, EntityType.comment, 5, 11); 1023 test(range, EntityType.comment, 5, 25); 1024 test(range, EntityType.pi, 6, 8); 1025 test(range, EntityType.cdata, 7, 3); 1026 test(range, EntityType.elementEmpty, 8, 15); 1027 test(range, EntityType.elementEnd, 8, 28); 1028 test(range, EntityType.comment, 9, 2); 1029 test(range, EntityType.pi, 10, 2); 1030 } 1031 1032 auto range = parseXML!simpleXML(xml); 1033 test(range, EntityType.elementStart, 4, 2); 1034 test(range, EntityType.cdata, 7, 3); 1035 test(range, EntityType.elementStart, 8, 15); 1036 test(range, EntityType.elementEnd, 8, 15); 1037 test(range, EntityType.elementEnd, 8, 28); 1038 } 1039 1040 1041 /++ 1042 Gives the name of this Entity. 1043 1044 Note that this is the direct name in the XML for this entity and 1045 does not contain any of the names of any of the parent entities that 1046 this entity has. If an application wants the full "path" of the 1047 entity, then it will have to keep track of that itself. The parser 1048 does not do that as it would require allocating memory. 
1049 1050 $(TABLE 1051 $(TR $(TH Supported $(LREF EntityType)s:)) 1052 $(TR $(TD $(LREF2 elementStart, EntityType))) 1053 $(TR $(TD $(LREF2 elementEnd, EntityType))) 1054 $(TR $(TD $(LREF2 elementEmpty, EntityType))) 1055 $(TR $(TD $(LREF2 pi, EntityType))) 1056 ) 1057 +/ 1058 @property SliceOfR name() 1059 { 1060 import dxml.internal : checkedSave, stripBCU; 1061 with(EntityType) 1062 { 1063 import std.format : format; 1064 assert(only(elementStart, elementEnd, elementEmpty, pi).canFind(_type), 1065 format("name cannot be called with %s", _type)); 1066 } 1067 return stripBCU!R(checkedSave(_name)); 1068 } 1069 1070 /// 1071 static if(compileInTests) unittest 1072 { 1073 auto xml = "<root>\n" ~ 1074 " <empty/>\n" ~ 1075 " <?pi?>\n" ~ 1076 "</root>"; 1077 1078 auto range = parseXML(xml); 1079 assert(range.front.type == EntityType.elementStart); 1080 assert(range.front.name == "root"); 1081 range.popFront(); 1082 1083 assert(range.front.type == EntityType.elementEmpty); 1084 assert(range.front.name == "empty"); 1085 range.popFront(); 1086 1087 assert(range.front.type == EntityType.pi); 1088 assert(range.front.name == "pi"); 1089 range.popFront(); 1090 1091 assert(range.front.type == EntityType.elementEnd); 1092 assert(range.front.name == "root"); 1093 range.popFront(); 1094 1095 assert(range.empty); 1096 } 1097 1098 1099 /++ 1100 Returns a lazy range of attributes for a start tag where each 1101 attribute is represented as a$(BR) 1102 $(D $(PHOBOS_REF_ALTTEXT Tuple, Tuple, std, typecons)!( 1103 $(LREF2 SliceOfR, EntityRange), $(D_STRING "name"), 1104 $(LREF2 SliceOfR, EntityRange), $(D_STRING "value"), 1105 $(LREF TextPos), $(D_STRING "pos"))). 
1106 1107 $(TABLE 1108 $(TR $(TH Supported $(LREF EntityType)s:)) 1109 $(TR $(TD $(LREF2 elementStart, EntityType))) 1110 $(TR $(TD $(LREF2 elementEmpty, EntityType))) 1111 ) 1112 1113 See_Also: $(LREF2 Attribute, EntityRange.Entity)$(BR) 1114 $(REF decodeXML, dxml, util)$(BR) 1115 $(REF asDecodedXML, dxml, util) 1116 +/ 1117 @property auto attributes() 1118 { 1119 with(EntityType) 1120 { 1121 import std.format : format; 1122 assert(_type == elementStart || _type == elementEmpty, 1123 format("attributes cannot be called with %s", _type)); 1124 } 1125 1126 // STag ::= '<' Name (S Attribute)* S? '>' 1127 // Attribute ::= Name Eq AttValue 1128 // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 1129 1130 static struct AttributeRange 1131 { 1132 @property Attribute front() 1133 { 1134 return _front; 1135 } 1136 1137 void popFront() 1138 { 1139 import dxml.internal : stripBCU; 1140 1141 stripWS(_text); 1142 if(_text.input.empty) 1143 { 1144 empty = true; 1145 return; 1146 } 1147 1148 immutable pos = _text.pos; 1149 auto name = stripBCU!R(_text.takeName!'='()); 1150 stripWS(_text); 1151 popFrontAndIncCol(_text); 1152 stripWS(_text); 1153 _front = Attribute(name, stripBCU!R(takeEnquotedText(_text)), pos); 1154 } 1155 1156 @property auto save() 1157 { 1158 import dxml.internal : checkedSave; 1159 auto retval = this; 1160 retval._front = Attribute(_front[0].save, checkedSave(_front[1]), _front[2]); 1161 retval._text.input = checkedSave(retval._text.input); 1162 return retval; 1163 } 1164 1165 this(typeof(_text) text) 1166 { 1167 _front = Attribute.init; // This is utterly stupid. 
https://issues.dlang.org/show_bug.cgi?id=13945 1168 _text = text; 1169 if(_text.input.empty) 1170 empty = true; 1171 else 1172 popFront(); 1173 } 1174 1175 bool empty; 1176 Attribute _front; 1177 typeof(_savedText) _text; 1178 } 1179 1180 return AttributeRange(_savedText.save); 1181 } 1182 1183 /// 1184 static if(compileInTests) unittest 1185 { 1186 import std.algorithm.comparison : equal; 1187 import std.algorithm.iteration : filter; 1188 { 1189 auto xml = "<root/>"; 1190 auto range = parseXML(xml); 1191 assert(range.front.type == EntityType.elementEmpty); 1192 assert(range.front.attributes.empty); 1193 1194 static assert(is(ElementType!(typeof(range.front.attributes)) == 1195 typeof(range).Entity.Attribute)); 1196 } 1197 { 1198 auto xml = "<root a='42' q='29' w='hello'/>"; 1199 auto range = parseXML(xml); 1200 assert(range.front.type == EntityType.elementEmpty); 1201 1202 auto attrs = range.front.attributes; 1203 assert(attrs.front.name == "a"); 1204 assert(attrs.front.value == "42"); 1205 assert(attrs.front.pos == TextPos(1, 7)); 1206 attrs.popFront(); 1207 1208 assert(attrs.front.name == "q"); 1209 assert(attrs.front.value == "29"); 1210 assert(attrs.front.pos == TextPos(1, 14)); 1211 attrs.popFront(); 1212 1213 assert(attrs.front.name == "w"); 1214 assert(attrs.front.value == "hello"); 1215 assert(attrs.front.pos == TextPos(1, 21)); 1216 attrs.popFront(); 1217 1218 assert(attrs.empty); 1219 } 1220 // Because the type of name and value is SliceOfR, == with a string 1221 // only works if the range passed to parseXML was string. 
1222 { 1223 auto xml = filter!(a => true)("<root a='42' q='29' w='hello'/>"); 1224 auto range = parseXML(xml); 1225 assert(range.front.type == EntityType.elementEmpty); 1226 1227 auto attrs = range.front.attributes; 1228 assert(equal(attrs.front.name, "a")); 1229 assert(equal(attrs.front.value, "42")); 1230 assert(attrs.front.pos == TextPos(1, 7)); 1231 attrs.popFront(); 1232 1233 assert(equal(attrs.front.name, "q")); 1234 assert(equal(attrs.front.value, "29")); 1235 assert(attrs.front.pos == TextPos(1, 14)); 1236 attrs.popFront(); 1237 1238 assert(equal(attrs.front.name, "w")); 1239 assert(equal(attrs.front.value, "hello")); 1240 assert(attrs.front.pos == TextPos(1, 21)); 1241 attrs.popFront(); 1242 1243 assert(attrs.empty); 1244 } 1245 } 1246 1247 static if(compileInTests) unittest 1248 { 1249 import core.exception : AssertError; 1250 import std.algorithm.comparison : equal; 1251 import std.exception : assertNotThrown, collectException, enforce; 1252 import std.typecons : Tuple, tuple; 1253 import dxml.internal : codeLen, testRangeFuncs; 1254 1255 static bool cmpAttr(T, U)(T lhs, U rhs) 1256 { 1257 return equal(lhs[0].save, rhs[0].save) && 1258 equal(lhs[1].save, rhs[1].save); 1259 } 1260 1261 static void test(alias func, ThrowOnEntityRef toer)(string text, EntityType type, 1262 Tuple!(string, string)[] expected, 1263 int row, int col, size_t line = __LINE__) 1264 { 1265 auto range = assertNotThrown!XMLParsingException(parseXML!(makeConfig(toer))(func(text)), 1266 "unittest 1", __FILE__, line); 1267 enforce!AssertError(range.front.type == type, "unittest failure 2", __FILE__, line); 1268 enforce!AssertError(equal!cmpAttr(range.front.attributes, expected), 1269 "unittest failure 3", __FILE__, line); 1270 enforce!AssertError(range._text.pos == TextPos(row, col), "unittest failure 4", __FILE__, line); 1271 } 1272 1273 static void testFail(alias func, ThrowOnEntityRef toer)(string text, 1274 int row, int col, size_t line = __LINE__) 1275 { 1276 auto e = 
collectException!XMLParsingException(parseXML!(makeConfig(toer))(func(text))); 1277 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 1278 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 1279 } 1280 1281 static foreach(func; testRangeFuncs) 1282 { 1283 static foreach(toer; [ThrowOnEntityRef.yes, ThrowOnEntityRef.no]) 1284 { 1285 test!(func, toer)("<root a='b'/>", EntityType.elementEmpty, [tuple("a", "b")], 1, 14); 1286 test!(func, toer)("<root a = 'b' />", EntityType.elementEmpty, [tuple("a", "b")], 1, 17); 1287 test!(func, toer)("<root \n\n a \n\n = \n\n 'b' \n\n />", EntityType.elementEmpty, 1288 [tuple("a", "b")], 9, 4); 1289 test!(func, toer)("<root a='b'></root>", EntityType.elementStart, [tuple("a", "b")], 1, 13); 1290 test!(func, toer)("<root a = 'b' ></root>", EntityType.elementStart, [tuple("a", "b")], 1, 16); 1291 test!(func, toer)("<root \n a \n = \n 'b' \n ></root>", EntityType.elementStart, 1292 [tuple("a", "b")], 5, 3); 1293 1294 test!(func, toer)("<root foo='\n\n\n'/>", EntityType.elementEmpty, [tuple("foo", "\n\n\n")], 4, 4); 1295 test!(func, toer)(`<root foo='"""'/>`, EntityType.elementEmpty, [tuple("foo", `"""`)], 1, 18); 1296 test!(func, toer)(`<root foo="'''"/>`, EntityType.elementEmpty, [tuple("foo", `'''`)], 1, 18); 1297 test!(func, toer)(`<root foo.=""/>`, EntityType.elementEmpty, [tuple("foo.", "")], 1, 16); 1298 test!(func, toer)(`<root foo="bar="/>`, EntityType.elementEmpty, [tuple("foo", "bar=")], 1, 19); 1299 1300 test!(func, toer)("<root foo='bar' a='b' hello='world'/>", EntityType.elementEmpty, 1301 [tuple("foo", "bar"), tuple("a", "b"), tuple("hello", "world")], 1, 38); 1302 test!(func, toer)(`<root foo="bar" a='b' hello="world"/>`, EntityType.elementEmpty, 1303 [tuple("foo", "bar"), tuple("a", "b"), tuple("hello", "world")], 1, 38); 1304 1305 test!(func, toer)(`<root foo="*" a='B' hello="%foo"/>`, EntityType.elementEmpty, 1306 [tuple("foo", "*"), tuple("a", "B"), 
tuple("hello", "%foo")], 1, 44); 1307 1308 test!(func, toer)(`<root foo="&" a='vector<int>'></root>`, EntityType.elementStart, 1309 [tuple("foo", "&"), tuple("a", "vector<int>"),], 1, 41); 1310 1311 test!(func, toer)(`<foo 京都市="ディラン"/>`, EntityType.elementEmpty, 1312 [tuple("京都市", "ディラン")], 1, codeLen!(func, `<foo 京都市="ディラン"/>`) + 1); 1313 1314 test!(func, toer)(`<root foo=">"/>`, EntityType.elementEmpty, [tuple("foo", ">")], 1, 16); 1315 test!(func, toer)(`<root foo=">>>>>>"/>`, EntityType.elementEmpty, [tuple("foo", ">>>>>>")], 1, 21); 1316 test!(func, toer)(`<root foo=">"></root>`, EntityType.elementStart, [tuple("foo", ">")], 1, 15); 1317 test!(func, toer)(`<root foo=">>>>>>"></root>`, EntityType.elementStart, [tuple("foo", ">>>>>>")], 1, 20); 1318 1319 test!(func, toer)(`<root foo="bar" foos="ball"/>`, EntityType.elementEmpty, 1320 [tuple("foo", "bar"), tuple("foos", "ball")], 1, 30); 1321 1322 testFail!(func, toer)(`<root a="""/>`, 1, 11); 1323 testFail!(func, toer)(`<root a='''/>`, 1, 11); 1324 testFail!(func, toer)("<root a=/>", 1, 9); 1325 testFail!(func, toer)("<root a='/>", 1, 9); 1326 testFail!(func, toer)("<root a='/>", 1, 9); 1327 testFail!(func, toer)("<root =''/>", 1, 7); 1328 testFail!(func, toer)(`<root a ""/>`, 1, 9); 1329 testFail!(func, toer)(`<root a""/>`, 1, 8); 1330 testFail!(func, toer)(`<root a/>`, 1, 8); 1331 testFail!(func, toer)("<root foo='bar' a=/>", 1, 19); 1332 testFail!(func, toer)("<root foo='bar' a='/>", 1, 19); 1333 testFail!(func, toer)("<root foo='bar' a='/>", 1, 19); 1334 testFail!(func, toer)("<root foo='bar' =''/>", 1, 17); 1335 testFail!(func, toer)("<root foo='bar' a= hello='world'/>", 1, 20); 1336 // It's 33 rather than 28, because it throws when processing the start tag and not when processing 1337 // the attributes. So, the mismatched quotes are detected before the attributes are checked. 
1338 testFail!(func, toer)("<root foo='bar' a=' hello='world'/>", 1, 33); 1339 testFail!(func, toer)("<root foo='bar' ='' hello='world'/>", 1, 17); 1340 testFail!(func, toer)("<root foo='bar'a='b'/>", 1, 16); 1341 testFail!(func, toer)(`<root .foo="bar"/>`, 1, 7); 1342 1343 testFail!(func, toer)(`<root foo="<"/>`, 1, 12); 1344 testFail!(func, toer)(`<root foo="<world"/>`, 1, 12); 1345 testFail!(func, toer)(`<root foo="hello<world"/>`, 1, 17); 1346 testFail!(func, toer)(`<root foo="&"/>`, 1, 12); 1347 testFail!(func, toer)(`<root foo="hello&"/>`, 1, 17); 1348 testFail!(func, toer)(`<root foo="hello&world"/>`, 1, 17); 1349 testFail!(func, toer)(`<root foo="&;"/>`, 1, 12); 1350 testFail!(func, toer)(`<root foo="&#;"/>`, 1, 12); 1351 testFail!(func, toer)(`<root foo="&#x;"/>`, 1, 12); 1352 testFail!(func, toer)(`<root foo="&#A;"/>`, 1, 12); 1353 testFail!(func, toer)(`<root foo="&#xG;"/>`, 1, 12); 1354 testFail!(func, toer)(`<root foo="*"/>`, 1, 12); 1355 testFail!(func, toer)(`<root foo="B"/>`, 1, 12); 1356 testFail!(func, toer)(`<root foo=""/>`, 1, 12); 1357 1358 testFail!(func, toer)("<root\n\nfoo='\nbarB'></root>", 4, 4); 1359 1360 testFail!(func, toer)(`<root a="""></root>`, 1, 11); 1361 testFail!(func, toer)(`<root a='''></root>`, 1, 11); 1362 testFail!(func, toer)("<root a=></root>", 1, 9); 1363 testFail!(func, toer)("<root a='></root>", 1, 9); 1364 testFail!(func, toer)("<root a='></root>", 1, 9); 1365 testFail!(func, toer)("<root =''></root>", 1, 7); 1366 testFail!(func, toer)(`<root a ""></root>`, 1, 9); 1367 testFail!(func, toer)(`<root a""></root>`, 1, 8); 1368 testFail!(func, toer)(`<root a></root>`, 1, 8); 1369 testFail!(func, toer)("<root foo='bar' a=></root>", 1, 19); 1370 testFail!(func, toer)("<root foo='bar' a='></root>", 1, 19); 1371 testFail!(func, toer)("<root foo='bar' a='></root>", 1, 19); 1372 testFail!(func, toer)("<root foo='bar' =''></root>", 1, 17); 1373 testFail!(func, toer)("<root foo='bar' a= hello='world'></root>", 1, 20); 1374 
testFail!(func, toer)("<root foo='bar' a=' hello='world'></root>", 1, 33); 1375 testFail!(func, toer)("<root foo='bar' ='' hello='world'></root>", 1, 17); 1376 testFail!(func, toer)("<root foo='bar'a='b'></root>", 1, 16); 1377 testFail!(func, toer)(`<root .foo='bar'></root>`, 1, 7); 1378 1379 testFail!(func, toer)(`<root foo="<"></root>`, 1, 12); 1380 testFail!(func, toer)(`<root foo="<world"></root>`, 1, 12); 1381 testFail!(func, toer)(`<root foo="hello<world"></root>`, 1, 17); 1382 testFail!(func, toer)(`<root foo="&"></root>`, 1, 12); 1383 testFail!(func, toer)(`<root foo="hello&"></root>`, 1, 17); 1384 testFail!(func, toer)(`<root foo="hello&world"></root>`, 1, 17); 1385 testFail!(func, toer)(`<root foo="&;"></root>`, 1, 12); 1386 testFail!(func, toer)(`<root foo="&#;"></root>`, 1, 12); 1387 testFail!(func, toer)(`<root foo="&#x;"></root>`, 1, 12); 1388 testFail!(func, toer)(`<root foo="&#A;"></root>`, 1, 12); 1389 testFail!(func, toer)(`<root foo="&#xG;"></root>`, 1, 12); 1390 testFail!(func, toer)(`<root foo="*"></root>`, 1, 12); 1391 testFail!(func, toer)(`<root foo="B"></root>`, 1, 12); 1392 testFail!(func, toer)(`<root foo=""></root>`, 1, 12); 1393 1394 testFail!(func, toer)(`<root a='42' a='19'/>`, 1, 14); 1395 testFail!(func, toer)(`<root a='42' b='hello' a='19'/>`, 1, 24); 1396 testFail!(func, toer)(`<root a='42' b='hello' a='19' c=''/>`, 1, 24); 1397 testFail!(func, toer)(`<root a='' b='' c='' d='' e='' f='' g='' e='' h=''/>`, 1, 42); 1398 testFail!(func, toer)(`<root foo='bar' foo='bar'/>`, 1, 17); 1399 1400 test!(func, toer)(`<root foo="&"></root>`, EntityType.elementStart, 1401 [tuple("foo", "&")], 1, 19); 1402 test!(func, toer)(`<root foo="foo&<>'"bar"></root>`, EntityType.elementStart, 1403 [tuple("foo", "foo&<>'"bar")], 1, 45); 1404 testFail!(func, toer)("<root foo='&;'></root>", 1, 12); 1405 testFail!(func, toer)("<root foo='&.;'></root>", 1, 12); 1406 testFail!(func, toer)("<root foo='\n & ule'></root>", 2, 2); 1407 testFail!(func, toer)("<root 
foo='\n &foo bar'></root>", 2, 2);
                }
                {
                    alias toer = ThrowOnEntityRef.yes;
                    testFail!(func, toer)(`<root foo="&foo;"/>`, 1, 12);
                    testFail!(func, toer)(`<root foo="&foo;"></root>`, 1, 12);
                    testFail!(func, toer)("<root foo='foo&bar.;'></root>", 1, 15);
                    testFail!(func, toer)(`<root foo="hello &a; world"></root>`, 1, 18);
                    testFail!(func, toer)("<root foo='hello \n &a; \n world'></root>", 2, 2);
                }
                {
                    alias toer = ThrowOnEntityRef.no;
                    test!(func, toer)(`<root foo="&foo;"/>`, EntityType.elementEmpty,
                                      [tuple("foo", "&foo;")], 1, 20);
                    test!(func, toer)(`<root foo="&foo;"></root>`, EntityType.elementStart,
                                      [tuple("foo", "&foo;")], 1, 19);
                    test!(func, toer)("<root foo='foo&bar.;'></root>", EntityType.elementStart,
                                      [tuple("foo", "foo&bar.;")], 1, 23);
                    test!(func, toer)(`<root foo="hello &a; world"></root>`, EntityType.elementStart,
                                      [tuple("foo", "hello &a; world")], 1, 29);
                    test!(func, toer)("<root foo='hello \n &a; \n world'></root>", EntityType.elementStart,
                                      [tuple("foo", "hello \n &a; \n world")], 3, 9);
                }
            }
        }


        /++
            Returns the textual value of this Entity.

            In the case of $(LREF EntityType.pi), this is the
            text that follows the name, whereas in the other cases, the text is
            the entire contents of the entity (save for the delimiters on the
            ends if that entity has them).

            $(TABLE
                $(TR $(TH Supported $(LREF EntityType)s:))
                $(TR $(TD $(LREF2 cdata, EntityType)))
                $(TR $(TD $(LREF2 comment, EntityType)))
                $(TR $(TD $(LREF2 pi, EntityType)))
                $(TR $(TD $(LREF2 _text, EntityType)))
            )

            See_Also: $(REF decodeXML, dxml, util)$(BR)
                      $(REF asDecodedXML, dxml, util)$(BR)
                      $(REF stripIndent, dxml, util)$(BR)
                      $(REF withoutIndent, dxml, util)
          +/
        @property SliceOfR text()
        {
            import dxml.internal : checkedSave, stripBCU;
            with(EntityType)
            {
                import std.format : format;
                // Calling text on any other entity type is a programming
                // error, so it is caught with an assertion, not an exception.
                assert(only(cdata, comment, pi, text).canFind(_type),
                       format("text cannot be called with %s", _type));
            }
            // A saved copy is returned so that _savedText is not consumed.
            return stripBCU!R(checkedSave(_savedText.input));
        }

        ///
        static if(compileInTests) unittest
        {
            import std.range.primitives : empty;

            auto xml = "<?xml version='1.0'?>\n" ~
                       "<?instructionName?>\n" ~
                       "<?foo here is something to say?>\n" ~
                       "<root>\n" ~
                       " <![CDATA[ Yay! random text >> << ]]>\n" ~
                       " <!-- some random comment -->\n" ~
                       " <p>something here</p>\n" ~
                       " <p>\n" ~
                       " something else\n" ~
                       " here</p>\n" ~
                       "</root>";
            auto range = parseXML(xml);

            // "<?instructionName?>\n" ~
            assert(range.front.type == EntityType.pi);
            assert(range.front.name == "instructionName");
            assert(range.front.text.empty);

            // "<?foo here is something to say?>\n" ~
            range.popFront();
            assert(range.front.type == EntityType.pi);
            assert(range.front.name == "foo");
            assert(range.front.text == "here is something to say");

            // "<root>\n" ~
            range.popFront();
            assert(range.front.type == EntityType.elementStart);

            // " <![CDATA[ Yay! random text >> << ]]>\n" ~
            range.popFront();
            assert(range.front.type == EntityType.cdata);
            assert(range.front.text == " Yay! random text >> << ");

            // " <!-- some random comment -->\n" ~
            range.popFront();
            assert(range.front.type == EntityType.comment);
            assert(range.front.text == " some random comment ");

            // " <p>something here</p>\n" ~
            range.popFront();
            assert(range.front.type == EntityType.elementStart);
            assert(range.front.name == "p");

            range.popFront();
            assert(range.front.type == EntityType.text);
            assert(range.front.text == "something here");

            range.popFront();
            assert(range.front.type == EntityType.elementEnd);
            assert(range.front.name == "p");

            // " <p>\n" ~
            // " something else\n" ~
            // " here</p>\n" ~
            range.popFront();
            assert(range.front.type == EntityType.elementStart);

            range.popFront();
            assert(range.front.type == EntityType.text);
            assert(range.front.text == "\n something else\n here");

            range.popFront();
            assert(range.front.type == EntityType.elementEnd);

            // "</root>"
            range.popFront();
            assert(range.front.type == EntityType.elementEnd);

            range.popFront();
            assert(range.empty);
        }


        // Reduce the chance of bugs if reference-type ranges are involved.
1546 static if(!isDynamicArray!R) this(this) 1547 { 1548 with(EntityType) final switch(_type) 1549 { 1550 case cdata: break; 1551 case comment: break; 1552 case elementStart: 1553 { 1554 _name = _name.save; 1555 break; 1556 } 1557 case elementEnd: goto case elementStart; 1558 case elementEmpty: goto case elementStart; 1559 case text: break; 1560 case pi: goto case elementStart; 1561 } 1562 1563 if(_type != EntityType.elementEnd) 1564 _savedText = _savedText.save; 1565 } 1566 1567 static if(compileInTests) unittest 1568 { 1569 import std.algorithm.comparison : equal; 1570 import dxml.internal : testRangeFuncs; 1571 1572 static bool cmpAttr(T)(T lhs, T rhs) 1573 { 1574 return equal(lhs.name.save, rhs.name.save) && 1575 equal(lhs.value.save, rhs.value.save); 1576 } 1577 1578 { 1579 auto xml = "<root>\n" ~ 1580 " <foo a='42'/>\n" ~ 1581 " <foo b='42'/>\n" ~ 1582 " <nocomment>nothing to say</nocomment>\n" ~ 1583 "</root>"; 1584 1585 // The duplicate lines aren't typos. We want to ensure that the 1586 // values are independent and that nothing was consumed. 
1587 static foreach(func; testRangeFuncs) 1588 {{ 1589 auto range = parseXML(func(xml)); 1590 range.popFront(); 1591 { 1592 auto entity = range.front; 1593 auto entity2 = entity; 1594 assert(entity.pos == entity2.pos); 1595 assert(equal(entity.name, entity2.name)); 1596 assert(equal(entity.name, entity2.name)); 1597 assert(equal!cmpAttr(entity.attributes, entity2.attributes)); 1598 assert(equal!cmpAttr(entity.attributes, entity2.attributes)); 1599 range.popFront(); 1600 assert(entity.pos == entity2.pos); 1601 assert(entity.pos != range.front.pos); 1602 } 1603 range.popFront(); 1604 range.popFront(); 1605 { 1606 auto entity = range.front; 1607 auto entity2 = entity; 1608 assert(entity.pos == entity2.pos); 1609 assert(equal(entity.text, entity2.text)); 1610 assert(equal(entity.text, entity2.text)); 1611 range.popFront(); 1612 assert(entity.pos == entity2.pos); 1613 assert(entity.pos != range.front.pos); 1614 } 1615 }} 1616 } 1617 { 1618 auto xml = "<root>\n" ~ 1619 " <![CDATA[whatever]]>\n" ~ 1620 " <?pi?>\n" ~ 1621 " <!--comment-->\n" ~ 1622 " <empty/>\n" ~ 1623 " <noend a='foo' b='bar'/>\n" ~ 1624 " <foo baz='42'></foo>\n" ~ 1625 "</root>"; 1626 1627 static foreach(func; testRangeFuncs) 1628 { 1629 for(auto range = parseXML(func(xml)); !range.empty; range.popFront()) 1630 { 1631 auto entity = range.front; 1632 auto entity2 = entity; 1633 1634 assert(entity.pos == range.front.pos); 1635 assert(entity.pos == entity2.pos); 1636 assert(entity.type == range.front.type); 1637 assert(entity.type == entity2.type); 1638 1639 with(EntityType) final switch(entity.type) 1640 { 1641 case cdata: goto case text; 1642 case comment: goto case text; 1643 case elementStart: 1644 { 1645 assert(equal!cmpAttr(entity.attributes, range.front.attributes)); 1646 assert(equal!cmpAttr(entity.attributes, entity2.attributes)); 1647 goto case elementEnd; 1648 } 1649 case elementEnd: 1650 { 1651 assert(equal(entity.name, range.front.name)); 1652 assert(equal(entity.name, entity2.name)); 1653 
break; 1654 } 1655 case elementEmpty: goto case elementStart; 1656 case text: 1657 { 1658 assert(equal(entity.text, range.front.text)); 1659 assert(equal(entity.text, entity2.text)); 1660 break; 1661 } 1662 case pi: 1663 { 1664 assert(equal(entity.name, range.front.name)); 1665 assert(equal(entity.name, entity2.name)); 1666 goto case text; 1667 } 1668 } 1669 } 1670 } 1671 } 1672 } 1673 1674 1675 private: 1676 1677 this(EntityType type) 1678 { 1679 _type = type; 1680 1681 // None of these initializations should be required. https://issues.dlang.org/show_bug.cgi?id=13945 1682 _name = typeof(_name).init; 1683 _savedText = typeof(_savedText).init; 1684 } 1685 1686 EntityType _type; 1687 TextPos _pos; 1688 Taken _name; 1689 typeof(EntityRange._savedText) _savedText; 1690 } 1691 1692 1693 /++ 1694 Returns the $(LREF Entity) representing the entity in the XML document 1695 which was most recently parsed. 1696 +/ 1697 @property Entity front() 1698 { 1699 auto retval = Entity(_type); 1700 with(EntityType) final switch(_type) 1701 { 1702 case cdata: retval._savedText = _savedText.save; break; 1703 case comment: goto case cdata; 1704 case elementStart: retval._name = _name.save; retval._savedText = _savedText.save; break; 1705 case elementEnd: retval._name = _name.save; break; 1706 case elementEmpty: goto case elementStart; 1707 case text: goto case cdata; 1708 case pi: goto case elementStart; 1709 } 1710 retval._pos = _entityPos; 1711 return retval; 1712 } 1713 1714 1715 /++ 1716 Move to the next entity. 1717 1718 The next entity is the next one that is linearly in the XML document. 1719 So, if the current entity has child entities, the next entity will be 1720 the first child entity, whereas if it has no child entities, it will be 1721 the next entity at the same level. 1722 1723 Throws: $(LREF XMLParsingException) on invalid XML. 
1724 +/ 1725 void popFront() 1726 { 1727 final switch(_grammarPos) with(GrammarPos) 1728 { 1729 case documentStart: _parseDocumentStart(); break; 1730 case prologMisc1: _parseAtPrologMisc!1(); break; 1731 case prologMisc2: _parseAtPrologMisc!2(); break; 1732 case splittingEmpty: 1733 { 1734 _type = EntityType.elementEnd; 1735 _tagStack.sawEntity(); 1736 _grammarPos = _tagStack.depth == 0 ? GrammarPos.endMisc : GrammarPos.contentCharData2; 1737 break; 1738 } 1739 case contentCharData1: 1740 { 1741 assert(_type == EntityType.elementStart); 1742 _tagStack.pushTag(_name.save); 1743 _parseAtContentCharData(); 1744 break; 1745 } 1746 case contentMid: _parseAtContentMid(); break; 1747 case contentCharData2: _parseAtContentCharData(); break; 1748 case endTag: _parseElementEnd(); break; 1749 case endMisc: _parseAtEndMisc(); break; 1750 case documentEnd: assert(0, "It's illegal to call popFront() on an empty EntityRange."); 1751 } 1752 } 1753 1754 1755 /++ 1756 Whether the end of the XML document has been reached. 1757 1758 Note that because an $(LREF XMLParsingException) will be thrown an 1759 invalid XML, it's actually possible to call 1760 $(LREF2 front, EntityRange) and $(LREF2 popFront, EntityRange) without 1761 checking empty if the only way that empty would be true is if the XML 1762 were invalid (e.g. if at a start tag, it's a given that there's at 1763 least one end tag left in the document unless it's invalid XML). 1764 1765 However, of course, caution should be used to ensure that incorrect 1766 assumptions are not made that allow the document to reach its end 1767 earlier than predicted without throwing an $(LREF XMLParsingException), 1768 since it's still an error to call $(LREF2 front, EntityRange) or 1769 $(LREF2 popFront, EntityRange) if empty would return false. 
1770 +/ 1771 @property bool empty() @safe const pure nothrow @nogc 1772 { 1773 return _grammarPos == GrammarPos.documentEnd; 1774 } 1775 1776 1777 /++ 1778 Forward range function for obtaining a copy of the range which can then 1779 be iterated independently of the original. 1780 +/ 1781 @property auto save() 1782 { 1783 // The init check nonsense is because of ranges whose init values blow 1784 // up when save is called (e.g. a range that's a class). 1785 auto retval = this; 1786 if(retval._name !is typeof(retval._name).init) 1787 retval._name = _name.save; 1788 if(retval._text.input !is typeof(retval._text.input).init) 1789 retval._text.input = _text.input.save; 1790 if(retval._savedText.input !is typeof(retval._savedText.input).init) 1791 retval._savedText.input = _savedText.input.save; 1792 return retval; 1793 } 1794 1795 static if(compileInTests) unittest 1796 { 1797 import std.algorithm.comparison : equal; 1798 import std.exception : assertNotThrown; 1799 import dxml.internal : testRangeFuncs; 1800 1801 static bool cmpAttr(T)(T lhs, T rhs) 1802 { 1803 return equal(lhs.name.save, rhs.name.save) && 1804 equal(lhs.value.save, rhs.value.save); 1805 } 1806 1807 static void testEqual(ER)(ER one, ER two) 1808 { 1809 while(!one.empty && !two.empty) 1810 { 1811 auto left = one.front; 1812 auto right = two.front; 1813 1814 assert(left.pos == right.pos); 1815 assert(left.type == right.type); 1816 1817 with(EntityType) final switch(left.type) 1818 { 1819 case cdata: goto case text; 1820 case comment: goto case text; 1821 case elementStart: 1822 { 1823 assert(equal!cmpAttr(left.attributes, right.attributes)); 1824 goto case elementEnd; 1825 } 1826 case elementEnd: assert(equal(left.name, right.name)); break; 1827 case elementEmpty: goto case elementStart; 1828 case text: assert(equal(left.text, right.text)); break; 1829 case pi: assert(equal(left.name, right.name)); goto case text; 1830 } 1831 1832 one.popFront(); 1833 two.popFront(); 1834 } 1835 1836 assert(one.empty); 
1837 assert(two.empty); 1838 } 1839 1840 auto xml = "<root>\n" ~ 1841 " <!-- comment -->\n" ~ 1842 " <something>\n" ~ 1843 " <else/>\n" ~ 1844 " somet text <i>goes</i> here\n" ~ 1845 " </something>\n" ~ 1846 "</root>"; 1847 1848 static foreach(i, func; testRangeFuncs) 1849 {{ 1850 auto text = func(xml); 1851 testEqual(parseXML(text.save), parseXML(text.save)); 1852 auto range = parseXML(text.save); 1853 testEqual(range.save, range.save); 1854 }} 1855 } 1856 1857 1858 /++ 1859 Returns an empty range. This corresponds to 1860 $(PHOBOS_REF _takeNone, std, range) except that it doesn't create a 1861 wrapper type. 1862 +/ 1863 EntityRange takeNone() 1864 { 1865 auto retval = save; 1866 retval._grammarPos = GrammarPos.documentEnd; 1867 return retval; 1868 } 1869 1870 1871 private: 1872 1873 void _parseDocumentStart() 1874 { 1875 auto orig = _text.save; 1876 immutable wasWS = _text.stripWS(); 1877 if(_text.stripStartsWith("<?xml")) 1878 { 1879 if(wasWS) 1880 throw new XMLParsingException("Cannot have whitespace before the <?xml...?> declaration", TextPos.init); 1881 checkNotEmpty(_text); 1882 if(_text.input.front == '?' 
|| isSpace(_text.input.front)) 1883 _text.skipUntilAndDrop!"?>"(); 1884 else 1885 _text = orig; 1886 } 1887 _grammarPos = GrammarPos.prologMisc1; 1888 _parseAtPrologMisc!1(); 1889 } 1890 1891 static if(compileInTests) unittest 1892 { 1893 import core.exception : AssertError; 1894 import std.exception : assertNotThrown, enforce; 1895 import dxml.internal : testRangeFuncs; 1896 1897 static void test(alias func)(string xml, int row, int col, size_t line = __LINE__) 1898 { 1899 auto range = assertNotThrown!XMLParsingException(parseXML(func(xml))); 1900 enforce!AssertError(range._type == EntityType.elementEmpty, "unittest failure 1", __FILE__, line); 1901 enforce!AssertError(range._text.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 1902 } 1903 1904 static foreach(func; testRangeFuncs) 1905 { 1906 test!func("<root/>", 1, 8); 1907 test!func("\n\t\n <root/> \n", 3, 9); 1908 test!func("<?xml\n\n\nversion='1.8'\n\n\n\nencoding='UTF-8'\n\n\nstandalone='yes'\n?><root/>", 12, 10); 1909 test!func("<?xml\n\n\n \r\r\r\n\nversion='1.8'?><root/>", 6, 23); 1910 test!func("<?xml\n\n\n \r\r\r\n\nversion='1.8'?>\n <root/>", 7, 13); 1911 test!func("<root/>", 1, 8); 1912 test!func("\n\t\n <root/> \n", 3, 9); 1913 } 1914 } 1915 1916 1917 // Parse at GrammarPos.prologMisc1 or GrammarPos.prologMisc2. 1918 void _parseAtPrologMisc(int miscNum)() 1919 { 1920 static assert(miscNum == 1 || miscNum == 2); 1921 1922 // document ::= prolog element Misc* 1923 // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? 1924 // Misc ::= Comment | PI | S 1925 1926 stripWS(_text); 1927 checkNotEmpty(_text); 1928 if(_text.input.front != '<') 1929 throw new XMLParsingException("Expected <", _text.pos); 1930 popFrontAndIncCol(_text); 1931 checkNotEmpty(_text); 1932 1933 switch(_text.input.front) 1934 { 1935 // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 1936 // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? 
'>' 1937 case '!': 1938 { 1939 immutable bangPos = _text.pos; 1940 popFrontAndIncCol(_text); 1941 if(_text.stripStartsWith("--")) 1942 { 1943 _parseComment(); 1944 static if(config.skipComments == SkipComments.yes) 1945 _parseAtPrologMisc!miscNum(); 1946 break; 1947 } 1948 static if(miscNum == 1) 1949 { 1950 if(_text.stripStartsWith("DOCTYPE")) 1951 { 1952 if(!_text.stripWS()) 1953 throw new XMLParsingException("Whitespace must follow <!DOCTYPE", _text.pos); 1954 _parseDoctypeDecl(); 1955 break; 1956 } 1957 throw new XMLParsingException("Expected Comment or DOCTYPE section", bangPos); 1958 } 1959 else 1960 { 1961 if(_text.stripStartsWith("DOCTYPE")) 1962 { 1963 throw new XMLParsingException("Only one <!DOCTYPE ...> declaration allowed per XML document", 1964 bangPos); 1965 } 1966 throw new XMLParsingException("Expected Comment", bangPos); 1967 } 1968 } 1969 // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 1970 case '?': 1971 { 1972 _parsePI(); 1973 static if(config.skipPI == SkipPI.yes) 1974 popFront(); 1975 break; 1976 } 1977 // element ::= EmptyElemTag | STag content ETag 1978 default: 1979 { 1980 _parseElementStart(); 1981 break; 1982 } 1983 } 1984 } 1985 1986 1987 // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 1988 // Parses a comment. <!-- was already removed from the front of the input. 
1989 void _parseComment() 1990 { 1991 static if(config.skipComments == SkipComments.yes) 1992 _text.skipUntilAndDrop!"--"(); 1993 else 1994 { 1995 _entityPos = TextPos(_text.pos.line, _text.pos.col - 4); 1996 _type = EntityType.comment; 1997 _tagStack.sawEntity(); 1998 _savedText.pos = _text.pos; 1999 _savedText.input = _text.takeUntilAndDrop!"--"(); 2000 } 2001 if(_text.input.empty || _text.input.front != '>') 2002 throw new XMLParsingException("Comments cannot contain -- and cannot be terminated by --->", _text.pos); 2003 // This is here rather than at the end of the previous static if block 2004 // so that the error message for improperly terminating a comment takes 2005 // precedence over the one involving invalid characters in the comment. 2006 static if(config.skipComments == SkipComments.no) 2007 checkText!true(_savedText); 2008 popFrontAndIncCol(_text); 2009 } 2010 2011 static if(compileInTests) unittest 2012 { 2013 import core.exception : AssertError; 2014 import std.algorithm.comparison : equal; 2015 import std.exception : assertNotThrown, assertThrown, collectException, enforce; 2016 import dxml.internal : codeLen, testRangeFuncs; 2017 2018 static void test(alias func)(string text, string expected, int row, int col, size_t line = __LINE__) 2019 { 2020 auto range = assertNotThrown!XMLParsingException(parseXML(func(text ~ "<root/>"))); 2021 enforce!AssertError(range.front.type == EntityType.comment, "unittest failure 1", __FILE__, line); 2022 enforce!AssertError(equal(range.front.text, expected), "unittest failure 2", __FILE__, line); 2023 enforce!AssertError(range._text.pos == TextPos(row, col), "unittest failure 3", __FILE__, line); 2024 } 2025 2026 static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__) 2027 { 2028 auto e = collectException!XMLParsingException(parseXML(func(text ~ "<root/>"))); 2029 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 2030 enforce!AssertError(e.pos == TextPos(row, col), 
"unittest failure 2", __FILE__, line); 2031 } 2032 2033 static foreach(func; testRangeFuncs) 2034 { 2035 test!func("<!--foo-->", "foo", 1, 11); 2036 test!func("<!-- foo -->", " foo ", 1, 13); 2037 test!func("<!-- -->", " ", 1, 9); 2038 test!func("<!---->", "", 1, 8); 2039 test!func("<!--- comment -->", "- comment ", 1, 18); 2040 test!func("<!-- \n foo \n -->", " \n foo \n ", 3, 5); 2041 test!func("<!--京都市 ディラン-->", "京都市 ディラン", 1, codeLen!(func, "<!--京都市 ディラン-->") + 1); 2042 test!func("<!--&-->", "&", 1, 9); 2043 test!func("<!--<-->", "<", 1, 9); 2044 test!func("<!-->-->", ">", 1, 9); 2045 test!func("<!--->-->", "->", 1, 10); 2046 2047 testFail!func("<!", 1, 2); 2048 testFail!func("<!- comment -->", 1, 2); 2049 testFail!func("<!-- comment ->", 1, 5); 2050 testFail!func("<!-- comment --->", 1, 16); 2051 testFail!func("<!---- comment -->", 1, 7); 2052 testFail!func("<!-- comment -- comment -->", 1, 16); 2053 testFail!func("<!->", 1, 2); 2054 testFail!func("<!-->", 1, 5); 2055 testFail!func("<!--->", 1, 5); 2056 testFail!func("<!----->", 1, 7); 2057 testFail!func("<!blah>", 1, 2); 2058 testFail!func("<! 
blah>", 1, 2); 2059 testFail!func("<!-- \n\n \v \n -->", 3, 4); 2060 testFail!func("<!--京都市 ディラン\v-->", 1, codeLen!(func, "<!--京都市 ディラン\v")); 2061 2062 { 2063 auto xml = func("<!DOCTYPE foo><!-- comment --><root/>"); 2064 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2065 assert(range.front.type == EntityType.comment); 2066 assert(equal(range.front.text, " comment ")); 2067 } 2068 { 2069 auto xml = func("<root><!-- comment --></root>"); 2070 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2071 assertNotThrown!XMLParsingException(range.popFront()); 2072 assert(range.front.type == EntityType.comment); 2073 assert(equal(range.front.text, " comment ")); 2074 } 2075 { 2076 auto xml = func("<root/><!-- comment -->"); 2077 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2078 assertNotThrown!XMLParsingException(range.popFront()); 2079 assert(range.front.type == EntityType.comment); 2080 assert(equal(range.front.text, " comment ")); 2081 } 2082 2083 static foreach(comment; ["<!foo>", "<! 
foo>", "<!->", "<!-->", "<!--->"]) 2084 { 2085 { 2086 auto xml = func("<!DOCTYPE foo>" ~ comment ~ "<root/>"); 2087 assertThrown!XMLParsingException(parseXML(xml)); 2088 } 2089 { 2090 auto xml = func("<root>" ~ comment ~ "<root>"); 2091 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2092 assertThrown!XMLParsingException(range.popFront()); 2093 } 2094 { 2095 auto xml = func("<root/>" ~ comment); 2096 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2097 assertThrown!XMLParsingException(range.popFront()); 2098 } 2099 } 2100 2101 { 2102 auto xml = "<!--one-->\n" ~ 2103 "<!--two-->\n" ~ 2104 "<root>\n" ~ 2105 " <!--three-->\n" ~ 2106 " <!--four-->\n" ~ 2107 "</root>\n" ~ 2108 "<!--five-->\n" ~ 2109 "<!--six-->"; 2110 2111 auto text = func(xml); 2112 { 2113 auto range = parseXML(text.save); 2114 assert(range.front.type == EntityType.comment); 2115 assert(equal(range.front.text, "one")); 2116 assertNotThrown!XMLParsingException(range.popFront()); 2117 assert(range.front.type == EntityType.comment); 2118 assert(equal(range.front.text, "two")); 2119 assertNotThrown!XMLParsingException(range.popFront()); 2120 assert(range.front.type == EntityType.elementStart); 2121 assert(equal(range.front.name, "root")); 2122 assertNotThrown!XMLParsingException(range.popFront()); 2123 assert(range.front.type == EntityType.comment); 2124 assert(equal(range.front.text, "three")); 2125 assertNotThrown!XMLParsingException(range.popFront()); 2126 assert(range.front.type == EntityType.comment); 2127 assert(equal(range.front.text, "four")); 2128 assertNotThrown!XMLParsingException(range.popFront()); 2129 assert(range.front.type == EntityType.elementEnd); 2130 assert(equal(range.front.name, "root")); 2131 assertNotThrown!XMLParsingException(range.popFront()); 2132 assert(range.front.type == EntityType.comment); 2133 assert(equal(range.front.text, "five")); 2134 assertNotThrown!XMLParsingException(range.popFront()); 2135 assert(range.front.type == 
EntityType.comment); 2136 assert(equal(range.front.text, "six")); 2137 assertNotThrown!XMLParsingException(range.popFront()); 2138 assert(range.empty); 2139 } 2140 { 2141 auto range = parseXML!simpleXML(text.save); 2142 assert(range.front.type == EntityType.elementStart); 2143 assert(equal(range.front.name, "root")); 2144 assertNotThrown!XMLParsingException(range.popFront()); 2145 assert(range.front.type == EntityType.elementEnd); 2146 assert(equal(range.front.name, "root")); 2147 assertNotThrown!XMLParsingException(range.popFront()); 2148 assert(range.empty); 2149 } 2150 } 2151 } 2152 } 2153 2154 2155 // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 2156 // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) 2157 // Parses a processing instruction. < was already removed from the input. 2158 void _parsePI() 2159 { 2160 _entityPos = TextPos(_text.pos.line, _text.pos.col - 1); 2161 assert(_text.input.front == '?'); 2162 popFrontAndIncCol(_text); 2163 static if(config.skipPI == SkipPI.yes) 2164 _text.skipUntilAndDrop!"?>"(); 2165 else 2166 { 2167 immutable posAtName = _text.pos; 2168 if(_text.input.empty) 2169 throw new XMLParsingException("Unterminated processing instruction", posAtName); 2170 _type = EntityType.pi; 2171 _tagStack.sawEntity(); 2172 _name = takeName!'?'(_text); 2173 immutable posAtWS = _text.pos; 2174 stripWS(_text); 2175 checkNotEmpty(_text); 2176 _savedText.pos = _text.pos; 2177 _savedText.input = _text.takeUntilAndDrop!"?>"(); 2178 checkText!true(_savedText); 2179 if(walkLength(_name.save) == 3) 2180 { 2181 // FIXME icmp doesn't compile right now due to an issue with 2182 // byUTF that needs to be looked into. 
2183 /+ 2184 import std.uni : icmp; 2185 if(icmp(_name.save, "xml") == 0) 2186 throw new XMLParsingException("Processing instructions cannot be named xml", posAtName); 2187 +/ 2188 auto temp = _name.save; 2189 if(temp.front == 'x' || temp.front == 'X') 2190 { 2191 temp.popFront(); 2192 if(temp.front == 'm' || temp.front == 'M') 2193 { 2194 temp.popFront(); 2195 if(temp.front == 'l' || temp.front == 'L') 2196 throw new XMLParsingException("Processing instructions cannot be named xml", posAtName); 2197 } 2198 } 2199 } 2200 } 2201 } 2202 2203 static if(compileInTests) unittest 2204 { 2205 import core.exception : AssertError; 2206 import std.algorithm.comparison : equal; 2207 import std.exception : assertNotThrown, assertThrown, collectException, enforce; 2208 import std.utf : byUTF; 2209 import dxml.internal : codeLen, testRangeFuncs; 2210 2211 static void test(alias func)(string text, string name, string expected, 2212 int row, int col, size_t line = __LINE__) 2213 { 2214 auto range = assertNotThrown!XMLParsingException(parseXML(func(text ~ "<root/>")), 2215 "unittest failure 1", __FILE__, line); 2216 enforce!AssertError(range.front.type == EntityType.pi, "unittest failure 2", __FILE__, line); 2217 enforce!AssertError(equal(range.front.name, name), "unittest failure 3", __FILE__, line); 2218 enforce!AssertError(equal(range.front.text, expected), "unittest failure 4", __FILE__, line); 2219 enforce!AssertError(range._text.pos == TextPos(row, col), "unittest failure 5", __FILE__, line); 2220 } 2221 2222 static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__) 2223 { 2224 auto e = collectException!XMLParsingException(parseXML(func(text ~ "<root/>"))); 2225 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 2226 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 2227 } 2228 2229 static foreach(func; testRangeFuncs) 2230 { 2231 test!func("<?a?>", "a", "", 1, 6); 2232 test!func("<?foo?>", 
"foo", "", 1, 8); 2233 test!func("<?foo.?>", "foo.", "", 1, 9); 2234 test!func("<?foo bar?>", "foo", "bar", 1, 12); 2235 test!func("<?xmf bar?>", "xmf", "bar", 1, 12); 2236 test!func("<?xmlfoo bar?>", "xmlfoo", "bar", 1, 15); 2237 test!func("<?foo bar baz?>", "foo", "bar baz", 1, 16); 2238 test!func("<?foo\nbar baz?>", "foo", "bar baz", 2, 10); 2239 test!func("<?foo \n bar baz?>", "foo", "bar baz", 2, 11); 2240 test!func("<?foo bar\nbaz?>", "foo", "bar\nbaz", 2, 6); 2241 test!func("<?dlang is awesome?>", "dlang", "is awesome", 1, 21); 2242 test!func("<?dlang is awesome! ?>", "dlang", "is awesome! ", 1, 23); 2243 test!func("<?dlang\n\nis\n\nawesome\n\n?>", "dlang", "is\n\nawesome\n\n", 7, 3); 2244 test!func("<?京都市 ディラン?>", "京都市", "ディラン", 1, codeLen!(func, "<?京都市 ディラン?>") + 1); 2245 test!func("<?foo bar&baz?>", "foo", "bar&baz", 1, 16); 2246 test!func("<?foo bar<baz?>", "foo", "bar<baz", 1, 16); 2247 test!func("<?pi ?>", "pi", "", 1, 8); 2248 test!func("<?pi\n?>", "pi", "", 2, 3); 2249 test!func("<?foo ??>", "foo", "?", 1, 10); 2250 test!func("<?pi some data ? > <??>", "pi", "some data ? > <?", 1, 24); 2251 2252 testFail!func("<?", 1, 3); 2253 testFail!func("<??>", 1, 3); 2254 testFail!func("<? ?>", 1, 3); 2255 testFail!func("<?xml?><?xml?>", 1, 10); 2256 testFail!func("<?XML?>", 1, 3); 2257 testFail!func("<?xMl?>", 1, 3); 2258 testFail!func("<?foo>", 1, 6); 2259 testFail!func("<? 
foo?>", 1, 3); 2260 testFail!func("<?\nfoo?>", 1, 3); 2261 testFail!func("<??foo?>", 1, 3); 2262 testFail!func("<?.foo?>", 1, 3); 2263 testFail!func("<?foo bar\vbaz?>", 1, 10); 2264 2265 { 2266 auto xml = func("<!DOCTYPE foo><?foo bar?><root/>"); 2267 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2268 assert(range.front.type == EntityType.pi); 2269 assert(equal(range.front.name, "foo")); 2270 assert(equal(range.front.text, "bar")); 2271 } 2272 { 2273 auto xml = func("<root><?foo bar?></root>"); 2274 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2275 assertNotThrown!XMLParsingException(range.popFront()); 2276 assert(equal(range.front.name, "foo")); 2277 assert(equal(range.front.text, "bar")); 2278 } 2279 { 2280 auto xml = func("<root/><?foo bar?>"); 2281 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2282 assertNotThrown!XMLParsingException(range.popFront()); 2283 assert(equal(range.front.name, "foo")); 2284 assert(equal(range.front.text, "bar")); 2285 } 2286 2287 static foreach(pi; ["<?foo>", "<foo?>", "<? 
foo>"]) 2288 { 2289 { 2290 auto xml = func("<!DOCTYPE foo>" ~ pi ~ "<root/>"); 2291 assertThrown!XMLParsingException(parseXML(xml)); 2292 } 2293 { 2294 auto xml = func("<root>" ~ pi ~ "<root>"); 2295 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2296 assertThrown!XMLParsingException(range.popFront()); 2297 } 2298 { 2299 auto xml = func("<root/>" ~ pi); 2300 auto range = assertNotThrown!XMLParsingException(parseXML(xml)); 2301 assertThrown!XMLParsingException(range.popFront()); 2302 } 2303 } 2304 2305 { 2306 auto xml = "<?one?>\n" ~ 2307 "<?two?>\n" ~ 2308 "<root>\n" ~ 2309 " <?three?>\n" ~ 2310 " <?four?>\n" ~ 2311 "</root>\n" ~ 2312 "<?five?>\n" ~ 2313 "<?six?>"; 2314 2315 auto text = func(xml); 2316 { 2317 auto range = parseXML(text.save); 2318 assert(range.front.type == EntityType.pi); 2319 assert(equal(range.front.name, "one")); 2320 assertNotThrown!XMLParsingException(range.popFront()); 2321 assert(range.front.type == EntityType.pi); 2322 assert(equal(range.front.name, "two")); 2323 assertNotThrown!XMLParsingException(range.popFront()); 2324 assert(range.front.type == EntityType.elementStart); 2325 assert(equal(range.front.name, "root")); 2326 assertNotThrown!XMLParsingException(range.popFront()); 2327 assert(range.front.type == EntityType.pi); 2328 assert(equal(range.front.name, "three")); 2329 assertNotThrown!XMLParsingException(range.popFront()); 2330 assert(range.front.type == EntityType.pi); 2331 assert(equal(range.front.name, "four")); 2332 assertNotThrown!XMLParsingException(range.popFront()); 2333 assert(range.front.type == EntityType.elementEnd); 2334 assert(equal(range.front.name, "root")); 2335 assertNotThrown!XMLParsingException(range.popFront()); 2336 assert(range.front.type == EntityType.pi); 2337 assert(equal(range.front.name, "five")); 2338 assertNotThrown!XMLParsingException(range.popFront()); 2339 assert(range.front.type == EntityType.pi); 2340 assert(equal(range.front.name, "six")); 2341 
assertNotThrown!XMLParsingException(range.popFront()); 2342 assert(range.empty); 2343 } 2344 { 2345 auto range = parseXML!simpleXML(text.save); 2346 assert(range.front.type == EntityType.elementStart); 2347 assert(equal(range.front.name, "root")); 2348 assertNotThrown!XMLParsingException(range.popFront()); 2349 assert(range.front.type == EntityType.elementEnd); 2350 assert(equal(range.front.name, "root")); 2351 assertNotThrown!XMLParsingException(range.popFront()); 2352 assert(range.empty); 2353 } 2354 } 2355 } 2356 } 2357 2358 2359 // CDSect ::= CDStart CData CDEnd 2360 // CDStart ::= '<![CDATA[' 2361 // CData ::= (Char* - (Char* ']]>' Char*)) 2362 // CDEnd ::= ']]>' 2363 // Parses a CDATA. <![CDATA[ was already removed from the front of the input. 2364 void _parseCDATA() 2365 { 2366 _entityPos = TextPos(_text.pos.line, _text.pos.col - cast(int)"<![CDATA[".length); 2367 _type = EntityType.cdata; 2368 _tagStack.sawEntity(); 2369 _savedText.pos = _text.pos; 2370 _savedText.input = _text.takeUntilAndDrop!"]]>"; 2371 checkText!true(_savedText); 2372 _grammarPos = GrammarPos.contentCharData2; 2373 } 2374 2375 static if(compileInTests) unittest 2376 { 2377 import core.exception : AssertError; 2378 import std.algorithm.comparison : equal; 2379 import std.exception : assertNotThrown, collectException, enforce; 2380 import dxml.internal : codeLen, testRangeFuncs; 2381 2382 static void test(alias func)(string text, string expected, int row, int col, size_t line = __LINE__) 2383 { 2384 auto pos = TextPos(row, col + (row == 1 ? 
cast(int)"<root>".length : 0)); 2385 auto range = parseXML(func("<root>" ~ text ~ "<root/>")); 2386 assertNotThrown!XMLParsingException(range.popFront()); 2387 enforce!AssertError(range.front.type == EntityType.cdata, "unittest failure 1", __FILE__, line); 2388 enforce!AssertError(equal(range.front.text, expected), "unittest failure 2", __FILE__, line); 2389 enforce!AssertError(range._text.pos == pos, "unittest failure 3", __FILE__, line); 2390 } 2391 2392 static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__) 2393 { 2394 auto pos = TextPos(row, col + (row == 1 ? cast(int)"<root>".length : 0)); 2395 auto range = parseXML(func("<root>" ~ text ~ "<root/>")); 2396 auto e = collectException!XMLParsingException(range.popFront()); 2397 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 2398 enforce!AssertError(e.pos == pos, "unittest failure 2", __FILE__, line); 2399 } 2400 2401 static foreach(func; testRangeFuncs) 2402 { 2403 test!func("<![CDATA[]]>", "", 1, 13); 2404 test!func("<![CDATA[hello world]]>", "hello world", 1, 24); 2405 test!func("<![CDATA[\nhello\n\nworld\n]]>", "\nhello\n\nworld\n", 5, 4); 2406 test!func("<![CDATA[京都市]]>", "京都市", 1, codeLen!(func, "<![CDATA[京都市]>") + 2); 2407 test!func("<![CDATA[<><><><><<<<>>>>>> ] ] ]> <]> <<>> ][][] >> ]]>", 2408 "<><><><><<<<>>>>>> ] ] ]> <]> <<>> ][][] >> ", 1, 57); 2409 test!func("<![CDATA[&]]>", "&", 1, 14); 2410 2411 testFail!func("<[CDATA[]>", 1, 2); 2412 testFail!func("<![CDAT[]>", 1, 2); 2413 testFail!func("<![CDATA]>", 1, 2); 2414 testFail!func("<![CDATA[>", 1, 10); 2415 testFail!func("<![CDATA[]", 1, 10); 2416 testFail!func("<![CDATA[]>", 1, 10); 2417 testFail!func("<![CDATA[ \v ]]>", 1, 11); 2418 testFail!func("<![CDATA[ \n\n \v \n ]]>", 3, 2); 2419 } 2420 } 2421 2422 2423 // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? 
'>' 2424 // DeclSep ::= PEReference | S 2425 // intSubset ::= (markupdecl | DeclSep)* 2426 // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment 2427 // Parse doctypedecl after GrammarPos.prologMisc1. 2428 // <!DOCTYPE and any whitespace after it should have already been removed 2429 // from the input. 2430 void _parseDoctypeDecl() 2431 { 2432 outer: while(true) 2433 { 2434 _text.skipToOneOf!('"', '\'', '[', '>')(); 2435 switch(_text.input.front) 2436 { 2437 static foreach(quote; ['"', '\'']) 2438 { 2439 case quote: 2440 { 2441 popFrontAndIncCol(_text); 2442 _text.skipUntilAndDrop!([quote])(); 2443 continue outer; 2444 } 2445 } 2446 case '[': 2447 { 2448 popFrontAndIncCol(_text); 2449 while(true) 2450 { 2451 checkNotEmpty(_text); 2452 _text.skipToOneOf!('"', '\'', ']')(); 2453 switch(_text.input.front) 2454 { 2455 case '"': 2456 { 2457 popFrontAndIncCol(_text); 2458 _text.skipUntilAndDrop!`"`(); 2459 continue; 2460 } 2461 case '\'': 2462 { 2463 popFrontAndIncCol(_text); 2464 _text.skipUntilAndDrop!`'`(); 2465 continue; 2466 } 2467 case ']': 2468 { 2469 popFrontAndIncCol(_text); 2470 stripWS(_text); 2471 if(_text.input.empty || _text.input.front != '>') 2472 throw new XMLParsingException("Incorrectly terminated <!DOCTYPE> section.", _text.pos); 2473 popFrontAndIncCol(_text); 2474 _parseAtPrologMisc!2(); 2475 return; 2476 } 2477 default: assert(0); 2478 } 2479 } 2480 } 2481 case '>': 2482 { 2483 popFrontAndIncCol(_text); 2484 _parseAtPrologMisc!2(); 2485 break; 2486 } 2487 default: assert(0); 2488 } 2489 break; 2490 } 2491 } 2492 2493 static if(compileInTests) unittest 2494 { 2495 import core.exception : AssertError; 2496 import std.exception : assertNotThrown, collectException, enforce; 2497 import dxml.internal : testRangeFuncs; 2498 2499 static void test(alias func)(string text, int row, int col, size_t line = __LINE__) 2500 { 2501 auto pos = TextPos(row, col + cast(int)"<root/>".length); 2502 auto range = 
assertNotThrown!XMLParsingException(parseXML(func(text ~ "<root/>")), 2503 "unittest failure 1", __FILE__, line); 2504 enforce!AssertError(range.front.type == EntityType.elementEmpty, "unittest failure 2", __FILE__, line); 2505 enforce!AssertError(range._text.pos == pos, "unittest failure 3", __FILE__, line); 2506 } 2507 2508 static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__) 2509 { 2510 auto e = collectException!XMLParsingException(parseXML(func(text ~ "<root/>"))); 2511 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 2512 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 2513 } 2514 2515 static foreach(func; testRangeFuncs) 2516 { 2517 test!func("<!DOCTYPE name>", 1, 16); 2518 test!func("<!DOCTYPE \n\n\n name>", 4, 7); 2519 test!func("<!DOCTYPE name \n\n\n >", 4, 3); 2520 2521 test!func("<!DOCTYPE name []>", 1, 19); 2522 test!func("<!DOCTYPE \n\n\n name []>", 4, 10); 2523 test!func("<!DOCTYPE name \n\n\n []>", 4, 5); 2524 2525 test!func(`<!DOCTYPE name PUBLIC "'''" '"""'>`, 1, 35); 2526 test!func(`<!DOCTYPE name PUBLIC "'''" '"""' []>`, 1, 38); 2527 test!func(`<!DOCTYPE name PUBLIC 'foo' "'''">`, 1, 35); 2528 test!func(`<!DOCTYPE name PUBLIC 'foo' '"""' []>`, 1, 38); 2529 2530 test!func("<!DOCTYPE name [ <!ELEMENT foo EMPTY > ]>", 1, 42); 2531 test!func("<!DOCTYPE name [ <!ELEMENT bar ANY > ]>", 1, 40); 2532 test!func("<!DOCTYPE name [ <!ELEMENT mixed (#PCDATA) > ]>", 1, 48); 2533 test!func("<!DOCTYPE name [ <!ELEMENT mixed (#PCDATA | foo)> ]>", 1, 53); 2534 test!func("<!DOCTYPE name [ <!ELEMENT kids (foo) > ]>", 1, 43); 2535 test!func("<!DOCTYPE name [ <!ELEMENT kids (foo | bar)> ]>", 1, 48); 2536 2537 test!func("<!DOCTYPE name [ <!ATTLIST foo> ]>", 1, 35); 2538 test!func("<!DOCTYPE name [ <!ATTLIST foo def CDATA #REQUIRED> ]>", 1, 55); 2539 2540 test!func(`<!DOCTYPE name [ <!ENTITY foo "bar"> ]>`, 1, 40); 2541 test!func(`<!DOCTYPE name [ <!ENTITY foo 'bar'> ]>`, 1, 
40); 2542 test!func(`<!DOCTYPE name [ <!ENTITY foo SYSTEM 'sys'> ]>`, 1, 47); 2543 test!func(`<!DOCTYPE name [ <!ENTITY foo PUBLIC "'''" 'sys'> ]>`, 1, 53); 2544 2545 test!func(`<!DOCTYPE name [ <!NOTATION note PUBLIC 'blah'> ]>`, 1, 51); 2546 2547 test!func("<!DOCTYPE name [ <?pi> ]>", 1, 26); 2548 2549 test!func("<!DOCTYPE name [ <!-- coment --> ]>", 1, 36); 2550 2551 test!func("<!DOCTYPE name [ <?pi> <!----> <!ELEMENT blah EMPTY> ]>", 1, 56); 2552 test!func("<!DOCTYPE \nname\n[\n<?pi> \n <!---->\n<!ENTITY foo '\n\n'\n>\n]>", 10, 3); 2553 2554 test!func("<!DOCTYPE doc [\n" ~ 2555 "<!ENTITY e '<![CDATA[Tim Michael]]>'>\n" ~ 2556 "]>\n", 4, 1); 2557 2558 testFail!func("<!DOCTYP name>", 1, 2); 2559 testFail!func("<!DOCTYPEname>", 1, 10); 2560 testFail!func("<!DOCTYPE name1><!DOCTYPE name2>", 1, 18); 2561 testFail!func("<!DOCTYPE\n\nname1><!DOCTYPE name2>", 3, 8); 2562 testFail!func("<!DOCTYPE name [ ]<!--comment-->", 1, 19); 2563 2564 // FIXME This really should have the exception point at the quote and 2565 // say that it couldn't find the matching quote rather than point at 2566 // the character after it and say that it couldn't find a quote, but 2567 // that requires reworking some helper functions with better error 2568 // messages in mind. 2569 testFail!func(`<!DOCTYPE student SYSTEM "student".dtd"[` ~ 2570 "\n<!ELEMENT student (#PCDATA)>\n" ~ 2571 "]>", 1, 40); 2572 } 2573 } 2574 2575 2576 // Parse a start tag or empty element tag. It could be the root element, or 2577 // it could be a sub-element. 2578 // < was already removed from the front of the input. 
2579 void _parseElementStart() 2580 { 2581 _entityPos = TextPos(_text.pos.line, _text.pos.col - 1); 2582 _savedText.pos = _text.pos; 2583 _savedText.input = _text.takeUntilAndDrop!(">", true)(); 2584 2585 if(_savedText.input.empty) 2586 throw new XMLParsingException("Tag missing name", _savedText.pos); 2587 if(_savedText.input.front == '/') 2588 throw new XMLParsingException("Invalid end tag", _savedText.pos); 2589 2590 if(_savedText.input.length > 1) 2591 { 2592 auto temp = _savedText.input.save; 2593 temp.popFrontN(temp.length - 1); 2594 if(temp.front == '/') 2595 { 2596 _savedText.input = _savedText.input.takeExactly(_savedText.input.length - 1); 2597 2598 static if(config.splitEmpty == SplitEmpty.no) 2599 { 2600 _type = EntityType.elementEmpty; 2601 _tagStack.sawEntity(); 2602 _grammarPos = _tagStack.depth == 0 ? GrammarPos.endMisc : GrammarPos.contentCharData2; 2603 } 2604 else 2605 { 2606 _type = EntityType.elementStart; 2607 _tagStack.sawEntity(); 2608 _grammarPos = GrammarPos.splittingEmpty; 2609 } 2610 } 2611 else 2612 { 2613 _type = EntityType.elementStart; 2614 _tagStack.sawEntity(); 2615 _grammarPos = GrammarPos.contentCharData1; 2616 } 2617 } 2618 else 2619 { 2620 _type = EntityType.elementStart; 2621 _tagStack.sawEntity(); 2622 _grammarPos = GrammarPos.contentCharData1; 2623 } 2624 2625 _name = _savedText.takeName(); 2626 // The attributes should be all that's left in savedText. 
2627 if(_tagStack.atMax) 2628 { 2629 auto temp = _savedText.save; 2630 auto attrChecker = _tagStack.attrChecker; 2631 2632 while(true) 2633 { 2634 immutable wasWS = stripWS(temp); 2635 if(temp.input.empty) 2636 break; 2637 if(!wasWS) 2638 throw new XMLParsingException("Whitespace missing before attribute name", temp.pos); 2639 2640 immutable attrPos = temp.pos; 2641 attrChecker.pushAttr(temp.takeName!'='(), attrPos); 2642 stripWS(temp); 2643 2644 checkNotEmpty(temp); 2645 if(temp.input.front != '=') 2646 throw new XMLParsingException("= missing", temp.pos); 2647 popFrontAndIncCol(temp); 2648 2649 stripWS(temp); 2650 temp.takeAttValue(); 2651 } 2652 2653 attrChecker.checkAttrs(); 2654 } 2655 } 2656 2657 static if(compileInTests) unittest 2658 { 2659 import core.exception : AssertError; 2660 import std.algorithm.comparison : equal; 2661 import std.exception : assertNotThrown, collectException, enforce; 2662 import dxml.internal : codeLen, testRangeFuncs; 2663 2664 static void test(alias func)(string text, EntityType type, string name, 2665 int row, int col, size_t line = __LINE__) 2666 { 2667 auto range = assertNotThrown!XMLParsingException(parseXML(func(text))); 2668 enforce!AssertError(range.front.type == type, "unittest failure 1", __FILE__, line); 2669 enforce!AssertError(equal(range.front.name, name), "unittest failure 2", __FILE__, line); 2670 enforce!AssertError(range._text.pos == TextPos(row, col), "unittest failure 3", __FILE__, line); 2671 } 2672 2673 static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__) 2674 { 2675 auto xml = func(text); 2676 auto e = collectException!XMLParsingException(parseXML(func(text))); 2677 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 2678 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 2679 } 2680 2681 static foreach(func; testRangeFuncs) 2682 { 2683 test!func("<a/>", EntityType.elementEmpty, "a", 1, 5); 2684 test!func("<a></a>", 
EntityType.elementStart, "a", 1, 4); 2685 test!func("<root/>", EntityType.elementEmpty, "root", 1, 8); 2686 test!func("<root></root>", EntityType.elementStart, "root", 1, 7); 2687 test!func("<foo/>", EntityType.elementEmpty, "foo", 1, 7); 2688 test!func("<foo></foo>", EntityType.elementStart, "foo", 1, 6); 2689 test!func("<foo />", EntityType.elementEmpty, "foo", 1, 14); 2690 test!func("<foo ></foo>", EntityType.elementStart, "foo", 1, 13); 2691 test!func("<foo \n\n\n />", EntityType.elementEmpty, "foo", 4, 4); 2692 test!func("<foo \n\n\n ></foo>", EntityType.elementStart, "foo", 4, 3); 2693 test!func("<foo.></foo.>", EntityType.elementStart, "foo.", 1, 7); 2694 test!func(`<京都市></京都市>`, EntityType.elementStart, "京都市", 1, codeLen!(func, `<京都市>`) + 1); 2695 2696 testFail!func(`<.foo/>`, 1, 2); 2697 testFail!func(`<>`, 1, 2); 2698 testFail!func(`</>`, 1, 2); 2699 testFail!func(`</foo>`, 1, 2); 2700 2701 { 2702 auto range = assertNotThrown!XMLParsingException(parseXML!simpleXML(func("<root/>"))); 2703 assert(range.front.type == EntityType.elementStart); 2704 assert(equal(range.front.name, "root")); 2705 assert(range._text.pos == TextPos(1, 8)); 2706 assertNotThrown!XMLParsingException(range.popFront()); 2707 assert(range.front.type == EntityType.elementEnd); 2708 assert(equal(range.front.name, "root")); 2709 assert(range._text.pos == TextPos(1, 8)); 2710 } 2711 } 2712 } 2713 2714 2715 // Parse an end tag. It could be the root element, or it could be a 2716 // sub-element. 2717 // </ was already removed from the front of the input. 
    // Fills in _entityPos, _type and _name for the end tag, checks the name
    // against the tag stack, and selects the next grammar position.
    // Throws XMLParsingException if the input ends prematurely, if anything
    // other than whitespace sits between the name and the >, or (via
    // TagStack.popTag) if the name does not match the open start tag.
    void _parseElementEnd()
    {
        if(_text.input.empty)
            throw new XMLParsingException("Unterminated end tag", _text.pos);
        // The entity starts at the < of "</"; both characters have already
        // been consumed, hence the column is moved back by two.
        _entityPos = TextPos(_text.pos.line, _text.pos.col - 2);
        _type = EntityType.elementEnd;
        _tagStack.sawEntity();
        // Remember where the name starts so that popTag can report a
        // mismatch at the name rather than at the end of the tag.
        immutable namePos = _text.pos;
        _name = _text.takeName!'>'();
        stripWS(_text);
        if(_text.input.empty || _text.input.front != '>')
        {
            throw new XMLParsingException("There can only be whitespace between an end tag's name and the >",
                                          _text.pos);
        }
        popFrontAndIncCol(_text);
        _tagStack.popTag(_name.save, namePos);
        // Depth 0 means the root element was just closed, so only the
        // trailing Misc* may follow; otherwise parsing continues in content.
        _grammarPos = _tagStack.depth == 0 ? GrammarPos.endMisc : GrammarPos.contentCharData2;
    }

    static if(compileInTests) unittest
    {
        import core.exception : AssertError;
        import std.algorithm.comparison : equal;
        import std.exception : assertNotThrown, collectException, enforce;
        import dxml.internal : codeLen, testRangeFuncs;

        // Parses text, advances once, and expects an end tag with the given
        // name and the given parser position afterwards.
        static void test(alias func)(string text, string name, int row, int col, size_t line = __LINE__)
        {
            auto range = assertNotThrown!XMLParsingException(parseXML(func(text)));
            range.popFront();
            enforce!AssertError(range.front.type == EntityType.elementEnd, "unittest failure 1", __FILE__, line);
            enforce!AssertError(equal(range.front.name, name), "unittest failure 2", __FILE__, line);
            enforce!AssertError(range._text.pos == TextPos(row, col), "unittest failure 3", __FILE__, line);
        }

        // Expects the first popFront to throw an XMLParsingException at the
        // given position.
        static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__)
        {
            auto range = parseXML(func(text));
            auto e = collectException!XMLParsingException(range.popFront());
            enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line);
            enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line);
        }

        static foreach(func; testRangeFuncs)
        {
            test!func("<a></a>", "a", 1, 8);
            test!func("<foo></foo>", "foo", 1, 12);
            // NOTE(review): the expected column (20) is larger than this
            // literal's length would give; the whitespace runs in the literal
            // may have been longer originally -- confirm.
            test!func("<foo ></foo >", "foo", 1, 20);
            test!func("<foo \n ></foo \n >", "foo", 3, 3);
            test!func("<foo>\n\n\n</foo>", "foo", 4, 7);
            test!func("<foo.></foo.>", "foo.", 1, 14);
            test!func(`<京都市></京都市>`, "京都市", 1, codeLen!(func, `<京都市></京都市>`) + 1);

            testFail!func(`<foo></ foo>`, 1, 8);
            testFail!func(`<foo></bar>`, 1, 8);
            testFail!func(`<foo></fo>`, 1, 8);
            testFail!func(`<foo></food>`, 1, 8);
            testFail!func(`<a></>`, 1, 6);
            testFail!func(`<a></`, 1, 6);
            testFail!func(`<a><`, 1, 5);
            testFail!func(`<a></a b='42'>`, 1, 8);
        }
    }


    // GrammarPos.contentCharData1
    // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    // Parses at either CharData?. Nothing from the CharData? (or what's after it
    // if it's not there) has been consumed.
    //
    // If anything other than < follows the leading whitespace, the whole run
    // (including that whitespace) is reported as a text entity. Otherwise the
    // whitespace is dropped and the markup that follows is parsed right away.
    void _parseAtContentCharData()
    {
        checkNotEmpty(_text);
        // Keep a copy so that the leading whitespace can be restored if this
        // turns out to be text.
        auto orig = _text.save;
        stripWS(_text);
        checkNotEmpty(_text);
        if(_text.input.front != '<')
        {
            _text = orig;
            _entityPos = _text.pos;
            _type = EntityType.text;
            _tagStack.sawEntity();
            _savedText.pos = _text.pos;
            _savedText.input = _text.takeUntilAndDrop!"<"();
            checkText!false(_savedText);
            checkNotEmpty(_text);
            // The character after the < decides what the next call has to
            // parse: an end tag or some other piece of markup.
            if(_text.input.front == '/')
            {
                popFrontAndIncCol(_text);
                _grammarPos = GrammarPos.endTag;
            }
            else
                _grammarPos = GrammarPos.contentMid;
        }
        else
        {
            // Only whitespace preceded the <, so no text entity is produced.
            popFrontAndIncCol(_text);
            checkNotEmpty(_text);
            if(_text.input.front == '/')
            {
                popFrontAndIncCol(_text);
                _parseElementEnd();
            }
            else
                _parseAtContentMid();
        }
    }

    static if(compileInTests) unittest
    {
        import core.exception : AssertError;
        import std.algorithm.comparison : equal;
        import std.exception : assertNotThrown, collectException, enforce;
        import dxml.internal : codeLen, testRangeFuncs;

        // Wraps text in <root>...</root> and expects it to come back as a
        // single text entity. row/col are relative to the text itself; the
        // expected parser position is shifted by the markup around it.
        static void test(alias func, ThrowOnEntityRef toer)(string text, int row, int col, size_t line = __LINE__)
        {
            auto pos = TextPos(row, col + (cast(int)(row == 1 ? "<root></" : "</").length));
            auto range = parseXML!(makeConfig(toer))(func("<root>" ~ text ~ "</root>"));
            assertNotThrown!XMLParsingException(range.popFront());
            enforce!AssertError(range.front.type == EntityType.text, "unittest failure 1", __FILE__, line);
            enforce!AssertError(equal(range.front.text, text), "unittest failure 2", __FILE__, line);
            enforce!AssertError(range._text.pos == pos, "unittest failure 3", __FILE__, line);
        }

        // Wraps text in <root>...</root> and expects parsing it to throw at
        // the given position (relative to the text itself).
        static void testFail(alias func, ThrowOnEntityRef toer)(string text, int row, int col, size_t line = __LINE__)
        {
            auto pos = TextPos(row, col + (row == 1 ? cast(int)"<root>".length : 0));
            auto range = parseXML!(makeConfig(toer))(func("<root>" ~ text ~ "</root>"));
            auto e = collectException!XMLParsingException(range.popFront());
            enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line);
            enforce!AssertError(e.pos == pos, "unittest failure 2", __FILE__, line);
        }

        static foreach(func; testRangeFuncs)
        {
            static foreach(toer; [ThrowOnEntityRef.yes, ThrowOnEntityRef.no])
            {
                test!(func, toer)("hello world", 1, 12);
                test!(func, toer)("\nhello\n\nworld", 4, 6);
                test!(func, toer)("京都市", 1, codeLen!(func, "京都市") + 1);
                // NOTE(review): a one-character literal cannot end at column
                // 7; this literal may have been a character reference
                // originally -- confirm.
                test!(func, toer)("B", 1, 7);
                test!(func, toer)("]", 1, 2);
                test!(func, toer)("]]", 1, 3);
                test!(func, toer)("]>", 1, 3);
                // NOTE(review): a bare < would end the text entity, so this
                // literal presumably held an escaped form originally --
                // confirm.
                test!(func, toer)("foo \n\n < \n bar", 4, 5);

                testFail!(func, toer)("&", 1, 1);
                testFail!(func, toer)("&;", 1, 1);
                testFail!(func, toer)("&f", 1, 1);
                testFail!(func, toer)("\v", 1, 1);
                testFail!(func, toer)("hello&world", 1, 6);
                testFail!(func, toer)("hello\vworld", 1, 6);
                testFail!(func, toer)("hello&;world", 1, 6);
                testFail!(func, toer)("hello&#;world", 1, 6);
                testFail!(func, toer)("hello&#x;world", 1, 6);
                testFail!(func, toer)("hello&.;world", 1, 6);
                testFail!(func, toer)("\n\nfoo\nbar&.;", 4, 4);

                testFail!(func, toer)("]]>", 1, 1);
                testFail!(func, toer)("foo]]>bar", 1, 4);

                // Entity references that are not predefined are only an
                // error when ThrowOnEntityRef.yes is set.
                static if(toer == ThrowOnEntityRef.yes)
                {
                    testFail!(func, toer)("&foo; &bar baz", 1, 1);
                    testFail!(func, toer)("foo \n\n &e; \n bar", 3, 2);
                }
                else
                {
                    testFail!(func, toer)("&foo; &bar baz", 1, 7);
                    test!(func, toer)("foo \n\n &e; \n bar", 4, 5);
                }
            }
        }
    }


    // GrammarPos.contentMid
    // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    // The text right after the start tag was what was parsed previously. So,
    // that first CharData? was what was parsed last, and this parses starting
    // right after. The < should have already been removed from the input.
    //
    // The input must not be empty when this is called: front is read without
    // an emptiness check.
    void _parseAtContentMid()
    {
        // Note that References are treated as part of the CharData and not
        // parsed out by the EntityRange (see EntityRange.text).

        switch(_text.input.front)
        {
            // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
            // CDSect ::= CDStart CData CDEnd
            // CDStart ::= '<![CDATA['
            // CData ::= (Char* - (Char* ']]>' Char*))
            // CDEnd ::= ']]>'
            case '!':
            {
                popFrontAndIncCol(_text);
                if(_text.stripStartsWith("--"))
                {
                    _parseComment();
                    // A skipped comment is not reported, so keep parsing
                    // until there is an entity to report.
                    static if(config.skipComments == SkipComments.yes)
                        _parseAtContentCharData();
                    else
                        _grammarPos = GrammarPos.contentCharData2;
                }
                else if(_text.stripStartsWith("[CDATA["))
                    _parseCDATA();
                else
                {
                    // Report the error at the ! which was popped above.
                    immutable bangPos = TextPos(_text.pos.line, _text.pos.col - 1);
                    throw new XMLParsingException("Expected Comment or CDATA section", bangPos);
                }
                break;
            }
            // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
            case '?':
            {
                // The ? is still at the front of the input when _parsePI is
                // called.
                _parsePI();
                _grammarPos = GrammarPos.contentCharData2;
                static if(config.skipPI == SkipPI.yes)
                    popFront();
                break;
            }
            // element ::= EmptyElemTag | STag content ETag
            default:
            {
                _parseElementStart();
                break;
            }
        }
    }


    // This parses the Misc* that come after the root element.
    // Sets GrammarPos.documentEnd once only whitespace is left, and throws
    // XMLParsingException for anything other than a comment or a PI.
    void _parseAtEndMisc()
    {
        // Misc ::= Comment | PI | S

        stripWS(_text);

        if(_text.input.empty)
        {
            _grammarPos = GrammarPos.documentEnd;
            return;
        }

        if(_text.input.front != '<')
            throw new XMLParsingException("Expected <", _text.pos);
        popFrontAndIncCol(_text);
        checkNotEmpty(_text);

        switch(_text.input.front)
        {
            // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
            case '!':
            {
                popFrontAndIncCol(_text);
                if(_text.stripStartsWith("--"))
                {
                    _parseComment();
                    // A skipped comment is not reported, so look for the
                    // next Misc (or the end of the document) right away.
                    static if(config.skipComments == SkipComments.yes)
                        _parseAtEndMisc();
                    break;
                }
                // Report the error at the ! which was popped above.
                immutable bangPos = TextPos(_text.pos.line, _text.pos.col - 1);
                throw new XMLParsingException("Expected Comment", bangPos);
            }
            // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
            case '?':
            {
                // The ? is still at the front of the input when _parsePI is
                // called.
                _parsePI();
                static if(config.skipPI == SkipPI.yes)
                    popFront();
                break;
            }
            default: throw new XMLParsingException("Must be a comment or PI", _text.pos);
        }
    }

    // Used for keeping track of the names of start tags so that end tags can be
    // verified as well as making it possible to avoid redoing other validation.
    // We keep track of the total number of entities which have been parsed thus
    // far so that only whichever EntityRange is farthest along in parsing
    // actually adds or removes tags from the TagStack, and the parser can skip
    // some of the validation for ranges that are farther behind. That way, the
    // end tags get verified, but we only have one stack. If the stack were
    // duplicated with every call to save, then there would be a lot more
    // allocations, which we don't want. But because we only need to verify the
    // end tags once, we can get away with having a shared tag stack. The cost
    // is that we have to keep track of how many tags we've parsed so that we
    // know if an EntityRange should actually be pushing or popping tags from
    // the stack, but that's a lot cheaper than duplicating the stack, and it's
    // a lot less annoying than making EntityRange an input range and not a
    // forward range or making it a cursor rather than a range.
3012 struct TagStack 3013 { 3014 void pushTag(Taken tagName) 3015 { 3016 if(entityCount++ == state.maxEntities) 3017 { 3018 ++state.maxEntities; 3019 put(state.tags, tagName); 3020 } 3021 ++depth; 3022 } 3023 3024 void popTag(Taken tagName, TextPos pos) 3025 { 3026 import std.algorithm : equal; 3027 import std.format : format; 3028 if(entityCount++ == state.maxEntities) 3029 { 3030 assert(!state.tags.data.empty); 3031 if(!equal(state.tags.data.back.save, tagName.save)) 3032 { 3033 enum fmt = "Name of end tag </%s> does not match corresponding start tag <%s>"; 3034 throw new XMLParsingException(format!fmt(tagName, state.tags.data.back), pos); 3035 } 3036 ++state.maxEntities; 3037 state.tags.shrinkTo(state.tags.data.length - 1); 3038 } 3039 --depth; 3040 } 3041 3042 @property auto attrChecker() 3043 { 3044 assert(atMax); 3045 3046 static struct AttrChecker 3047 { 3048 void pushAttr(Taken attrName, TextPos attrPos) 3049 { 3050 put(state.attrs, Attribute(attrName, attrPos)); 3051 } 3052 3053 void checkAttrs() 3054 { 3055 import std.algorithm.comparison : cmp, equal; 3056 import std.algorithm.sorting : sort; 3057 import std.conv : to; 3058 3059 if(state.attrs.data.length < 2) 3060 return; 3061 3062 sort!((a,b) => cmp(a.taken.save, b.taken.save) < 0)(state.attrs.data); 3063 auto prev = state.attrs.data.front; 3064 foreach(attr; state.attrs.data[1 .. 
$]) 3065 { 3066 if(equal(prev.taken, attr.taken)) 3067 throw new XMLParsingException("Duplicate attribute name", attr.pos); 3068 prev = attr; 3069 } 3070 } 3071 3072 ~this() 3073 { 3074 state.attrs.clear(); 3075 } 3076 3077 SharedState* state; 3078 } 3079 3080 return AttrChecker(state); 3081 } 3082 3083 void sawEntity() 3084 { 3085 if(entityCount++ == state.maxEntities) 3086 ++state.maxEntities; 3087 } 3088 3089 @property bool atMax() 3090 { 3091 return entityCount == state.maxEntities; 3092 } 3093 3094 struct Attribute 3095 { 3096 Taken taken; 3097 TextPos pos; 3098 } 3099 3100 struct SharedState 3101 { 3102 import std.array : Appender; 3103 3104 Appender!(Taken[]) tags; 3105 Appender!(Attribute[]) attrs; 3106 size_t maxEntities; 3107 } 3108 3109 static create() 3110 { 3111 TagStack tagStack; 3112 tagStack.state = new SharedState; 3113 tagStack.state.tags.reserve(10); 3114 tagStack.state.attrs.reserve(10); 3115 return tagStack; 3116 } 3117 3118 SharedState* state; 3119 size_t entityCount; 3120 int depth; 3121 } 3122 3123 static if(compileInTests) unittest 3124 { 3125 import core.exception : AssertError; 3126 import std.algorithm.comparison : equal; 3127 import std.exception : assertNotThrown, collectException, enforce; 3128 import dxml.internal : testRangeFuncs; 3129 3130 static void test(alias func)(string text, size_t line = __LINE__) 3131 { 3132 auto xml = func(text); 3133 static foreach(config; someTestConfigs) 3134 {{ 3135 auto range = assertNotThrown!XMLParsingException(parseXML!config(xml.save), "unittest failure 1", 3136 __FILE__, line); 3137 assertNotThrown!XMLParsingException(walkLength(range), "unittest failure 2", __FILE__, line); 3138 }} 3139 } 3140 3141 static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__) 3142 { 3143 auto xml = func(text); 3144 static foreach(config; someTestConfigs) 3145 {{ 3146 auto range = assertNotThrown!XMLParsingException(parseXML!config(xml.save), "unittest failure 1", 3147 __FILE__, line); 
3148 auto e = collectException!XMLParsingException(walkLength(range)); 3149 enforce!AssertError(e !is null, "unittest failure 2", __FILE__, line); 3150 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 3", __FILE__, line); 3151 }} 3152 } 3153 3154 static foreach(func; testRangeFuncs) 3155 { 3156 test!func("<root></root>"); 3157 test!func("<root><a></a></root>"); 3158 test!func("<root><a><b></b></a></root>"); 3159 test!func("<root><a><b></b></a></root>"); 3160 test!func("<root><a><b></b></a><foo><bar></bar></foo></root>"); 3161 test!func("<a>\n" ~ 3162 " <b>\n" ~ 3163 " <c>\n" ~ 3164 " <d>\n" ~ 3165 " <e>\n" ~ 3166 " <f>\n" ~ 3167 " <g>\n" ~ 3168 " <h>\n" ~ 3169 " <i><i><i><i>\n" ~ 3170 " </i></i></i></i>\n" ~ 3171 " <i>\n" ~ 3172 " <j>\n" ~ 3173 " <k>\n" ~ 3174 " <l>\n" ~ 3175 " <m>\n" ~ 3176 " <n>\n" ~ 3177 " <o>\n" ~ 3178 " <p>\n" ~ 3179 " <q>\n" ~ 3180 " <r>\n" ~ 3181 " <s>\n" ~ 3182 " <!-- comment --> <?pi?> <t><u><v></v></u></t>\n" ~ 3183 " </s>\n" ~ 3184 " </r>\n" ~ 3185 " </q>\n" ~ 3186 " </p></o></n></m>\n" ~ 3187 " </l>\n" ~ 3188 " </k>\n" ~ 3189 " </j>\n" ~ 3190 "</i></h>" ~ 3191 " </g>\n" ~ 3192 " </f>\n" ~ 3193 " </e>\n" ~ 3194 " </d>\n" ~ 3195 " </c>\n" ~ 3196 " </b>\n" ~ 3197 "</a>"); 3198 test!func(`<京都市></京都市>`); 3199 3200 testFail!func(`<a>`, 1, 4); 3201 testFail!func(`<foo></foobar>`, 1, 8); 3202 testFail!func(`<foobar></foo>`, 1, 11); 3203 testFail!func(`<a><\a>`, 1, 5); 3204 testFail!func(`<a><a/>`, 1, 8); 3205 testFail!func(`<a><b>`, 1, 7); 3206 testFail!func(`<a><b><c>`, 1, 10); 3207 testFail!func(`<a></a><b>`, 1, 9); 3208 testFail!func(`<a></a><b></b>`, 1, 9); 3209 testFail!func(`<a><b></a></b>`, 1, 9); 3210 testFail!func(`<a><b><c></c><b></a>`, 1, 19); 3211 testFail!func(`<a><b></c><c></b></a>`, 1, 9); 3212 testFail!func(`<a><b></c></b></a>`, 1, 9); 3213 testFail!func("<a>\n" ~ 3214 " <b>\n" ~ 3215 " <c>\n" ~ 3216 " <d>\n" ~ 3217 " <e>\n" ~ 3218 " <f>\n" ~ 3219 " </f>\n" ~ 3220 " </e>\n" ~ 3221 " </d>\n" ~ 3222 " </c>\n" ~ 
3223 " </b>\n" ~ 3224 "<a>", 12, 4); 3225 testFail!func("<a>\n" ~ 3226 " <b>\n" ~ 3227 " <c>\n" ~ 3228 " <d>\n" ~ 3229 " <e>\n" ~ 3230 " <f>\n" ~ 3231 " </f>\n" ~ 3232 " </e>\n" ~ 3233 " </d>\n" ~ 3234 " </c>\n" ~ 3235 " </b>\n" ~ 3236 "</q>", 12, 3); 3237 } 3238 } 3239 3240 3241 struct Text(R) 3242 { 3243 alias config = cfg; 3244 alias Input = R; 3245 3246 Input input; 3247 TextPos pos; 3248 3249 @property save() { return typeof(this)(input.save, pos); } 3250 } 3251 3252 3253 alias Taken = typeof(takeExactly(byCodeUnit(R.init), 42)); 3254 3255 3256 EntityType _type; 3257 TextPos _entityPos; 3258 auto _grammarPos = GrammarPos.documentStart; 3259 3260 Taken _name; 3261 TagStack _tagStack; 3262 3263 Text!(typeof(byCodeUnit(R.init))) _text; 3264 Text!Taken _savedText; 3265 3266 3267 this(R xmlText) 3268 { 3269 _tagStack = TagStack.create(); 3270 _text.input = byCodeUnit(xmlText); 3271 3272 // None of these initializations should be required. https://issues.dlang.org/show_bug.cgi?id=13945 3273 _savedText = typeof(_savedText).init; 3274 _name = typeof(_name).init; 3275 3276 popFront(); 3277 } 3278 } 3279 3280 /// Ditto 3281 EntityRange!(config, R) parseXML(Config config = Config.init, R)(R xmlText) 3282 if(isForwardRange!R && isSomeChar!(ElementType!R)) 3283 { 3284 return EntityRange!(config, R)(xmlText); 3285 } 3286 3287 /// 3288 unittest 3289 { 3290 import std.range.primitives : walkLength; 3291 3292 auto xml = "<?xml version='1.0'?>\n" ~ 3293 "<?instruction start?>\n" ~ 3294 "<foo attr='42'>\n" ~ 3295 " <bar/>\n" ~ 3296 " <!-- no comment -->\n" ~ 3297 " <baz hello='world'>\n" ~ 3298 " nothing to say.\n" ~ 3299 " nothing at all...\n" ~ 3300 " </baz>\n" ~ 3301 "</foo>\n" ~ 3302 "<?some foo?>"; 3303 3304 { 3305 auto range = parseXML(xml); 3306 assert(range.front.type == EntityType.pi); 3307 assert(range.front.name == "instruction"); 3308 assert(range.front.text == "start"); 3309 3310 range.popFront(); 3311 assert(range.front.type == EntityType.elementStart); 3312 
assert(range.front.name == "foo"); 3313 3314 { 3315 auto attrs = range.front.attributes; 3316 assert(walkLength(attrs.save) == 1); 3317 assert(attrs.front.name == "attr"); 3318 assert(attrs.front.value == "42"); 3319 } 3320 3321 range.popFront(); 3322 assert(range.front.type == EntityType.elementEmpty); 3323 assert(range.front.name == "bar"); 3324 3325 range.popFront(); 3326 assert(range.front.type == EntityType.comment); 3327 assert(range.front.text == " no comment "); 3328 3329 range.popFront(); 3330 assert(range.front.type == EntityType.elementStart); 3331 assert(range.front.name == "baz"); 3332 3333 { 3334 auto attrs = range.front.attributes; 3335 assert(walkLength(attrs.save) == 1); 3336 assert(attrs.front.name == "hello"); 3337 assert(attrs.front.value == "world"); 3338 } 3339 3340 range.popFront(); 3341 assert(range.front.type == EntityType.text); 3342 assert(range.front.text == 3343 "\n nothing to say.\n nothing at all...\n "); 3344 3345 range.popFront(); 3346 assert(range.front.type == EntityType.elementEnd); // </baz> 3347 range.popFront(); 3348 assert(range.front.type == EntityType.elementEnd); // </foo> 3349 3350 range.popFront(); 3351 assert(range.front.type == EntityType.pi); 3352 assert(range.front.name == "some"); 3353 assert(range.front.text == "foo"); 3354 3355 range.popFront(); 3356 assert(range.empty); 3357 } 3358 { 3359 auto range = parseXML!simpleXML(xml); 3360 3361 // simpleXML is set to skip processing instructions. 3362 3363 assert(range.front.type == EntityType.elementStart); 3364 assert(range.front.name == "foo"); 3365 3366 { 3367 auto attrs = range.front.attributes; 3368 assert(walkLength(attrs.save) == 1); 3369 assert(attrs.front.name == "attr"); 3370 assert(attrs.front.value == "42"); 3371 } 3372 3373 // simpleXML is set to split empty tags so that <bar/> is treated 3374 // as the same as <bar></bar> so that code does not have to 3375 // explicitly handle empty tags. 
3376 range.popFront(); 3377 assert(range.front.type == EntityType.elementStart); 3378 assert(range.front.name == "bar"); 3379 range.popFront(); 3380 assert(range.front.type == EntityType.elementEnd); 3381 assert(range.front.name == "bar"); 3382 3383 // simpleXML is set to skip comments. 3384 3385 range.popFront(); 3386 assert(range.front.type == EntityType.elementStart); 3387 assert(range.front.name == "baz"); 3388 3389 { 3390 auto attrs = range.front.attributes; 3391 assert(walkLength(attrs.save) == 1); 3392 assert(attrs.front.name == "hello"); 3393 assert(attrs.front.value == "world"); 3394 } 3395 3396 range.popFront(); 3397 assert(range.front.type == EntityType.text); 3398 assert(range.front.text == 3399 "\n nothing to say.\n nothing at all...\n "); 3400 3401 range.popFront(); 3402 assert(range.front.type == EntityType.elementEnd); // </baz> 3403 range.popFront(); 3404 assert(range.front.type == EntityType.elementEnd); // </foo> 3405 range.popFront(); 3406 assert(range.empty); 3407 } 3408 } 3409 3410 // Test the state of the range immediately after parseXML returns. 
unittest
{
    import std.algorithm.comparison : equal;
    import dxml.internal : testRangeFuncs;

    // Every input is passed through func so that each of the range types in
    // testRangeFuncs is actually exercised rather than only plain strings.
    static foreach(func; testRangeFuncs)
    {
        // The XML declaration is never reported.
        static foreach(config; someTestConfigs)
        {{
            auto range = parseXML!config(func("<?xml?><root></root>"));
            assert(!range.empty);
            assert(range.front.type == EntityType.elementStart);
            assert(equal(range.front.name, "root"));
        }}

        // A leading comment is the first entity unless comments are skipped.
        static foreach(config; [Config.init, makeConfig(SkipPI.yes)])
        {{
            auto range = parseXML!config(func("<!--no comment--><root></root>"));
            assert(!range.empty);
            assert(range.front.type == EntityType.comment);
            assert(equal(range.front.text, "no comment"));
        }}
        static foreach(config; [simpleXML, makeConfig(SkipComments.yes)])
        {{
            auto range = parseXML!config(func("<!--no comment--><root></root>"));
            assert(!range.empty);
            assert(range.front.type == EntityType.elementStart);
            assert(equal(range.front.name, "root"));
        }}

        // A leading PI is the first entity unless PIs are skipped.
        static foreach(config; [Config.init, makeConfig(SkipComments.yes)])
        {{
            auto range = parseXML!config(func("<?private eye?><root></root>"));
            assert(!range.empty);
            assert(range.front.type == EntityType.pi);
            assert(equal(range.front.name, "private"));
            assert(equal(range.front.text, "eye"));
        }}
        static foreach(config; [simpleXML, makeConfig(SkipPI.yes)])
        {{
            auto range = parseXML!config(func("<?private eye?><root></root>"));
            assert(!range.empty);
            assert(range.front.type == EntityType.elementStart);
            assert(equal(range.front.name, "root"));
        }}

        static foreach(config; someTestConfigs)
        {{
            auto range = parseXML!config(func("<root></root>"));
            assert(!range.empty);
            assert(range.front.type == EntityType.elementStart);
            assert(equal(range.front.name, "root"));
        }}
    }
}

// Test various invalid states that didn't seem to fit well into tests elsewhere.
unittest
{
    import core.exception : AssertError;
    import std.exception : collectException, enforce;
    import dxml.internal : testRangeFuncs;

    // Parses the whole document under every test configuration and expects
    // an XMLParsingException at the given position, whether it comes from
    // parseXML itself or from one of the calls to popFront.
    static void testFail(alias func)(string text, int row, int col, size_t line = __LINE__)
    {
        auto input = func(text);
        static foreach(config; someTestConfigs)
        {{
            auto thrown = collectException!XMLParsingException(
                {
                    for(auto entities = parseXML!config(input.save); !entities.empty; entities.popFront())
                    {}
                }());
            enforce!AssertError(thrown !is null, "unittest failure 1", __FILE__, line);
            enforce!AssertError(thrown.pos == TextPos(row, col), "unittest failure 2", __FILE__, line);
        }}
    }

    static foreach(func; testRangeFuncs)
    {{
        // Anything resembling a second root element.
        testFail!func("<root></root><invalid></invalid>", 1, 15);
        testFail!func("<root></root><invalid/>", 1, 15);
        testFail!func("<root/><invalid></invalid>", 1, 9);
        testFail!func("<root/><invalid/>", 1, 9);

        // Text after the root element.
        testFail!func("<root></root>invalid", 1, 14);
        testFail!func("<root/>invalid", 1, 8);

        testFail!func("<root/><?pi?>invalid", 1, 14);
        testFail!func("<root/><?pi?><invalid/>", 1, 15);

        testFail!func("<root/><!DOCTYPE foo>", 1, 9);
        testFail!func("<root/></root>", 1, 9);

        // Text before anything else.
        testFail!func("invalid<root></root>", 1, 1);
        testFail!func("invalid<?xml?><root></root>", 1, 1);
        testFail!func("invalid<!DOCTYPE foo><root></root>", 1, 1);
        testFail!func("invalid<!--comment--><root></root>", 1, 1);
        testFail!func("invalid<?Poirot?><root></root>", 1, 1);

        // Text between the prolog and the root element.
        testFail!func("<?xml?>invalid<root></root>", 1, 8);
        testFail!func("<!DOCTYPE foo>invalid<root></root>", 1, 15);
        testFail!func("<!--comment-->invalid<root></root>", 1, 15);
        testFail!func("<?Poirot?>invalid<root></root>", 1, 11);

        // Documents without a (complete) root element.
        testFail!func("<?xml?>", 1, 8);
        testFail!func("<!DOCTYPE name>", 1, 16);
        testFail!func("<?Sherlock?>", 1, 13);
        testFail!func("<?Poirot?><?Sherlock?><?Holmes?>", 1, 33);
        testFail!func("<?Poirot?></Poirot>", 1, 12);
        testFail!func("</Poirot>", 1, 2);
        testFail!func("<", 1, 2);
        testFail!func(`</`, 1, 2);
        testFail!func(`</a`, 1, 2);
        testFail!func(`</a>`, 1, 2);


        testFail!func("<doc>]]></doc>", 1, 6);

        // The XML declaration has to be the very first thing in the input.
        testFail!func(" <?xml?><root/>", 1, 1);
        testFail!func("\n<?xml?><root/>", 1, 1);
    }}
}

// Test that parseXML and EntityRange's properties work with @safe.
// pure would be nice too, but at minimum, the use of format for exception
// messages, and the use of assumeSafeAppend prevent it. It may or may not be
// worth trying to fix that.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import dxml.internal : testRangeFuncs;

    auto xml = "<root>\n" ~
               " <![CDATA[nothing]]>\n" ~
               " <foo a='42'/>\n" ~
               "</root>";

    // NOTE(review): func is never applied to xml, so every iteration parses
    // the plain string. Confirm whether the range types in testRangeFuncs
    // were meant to be exercised here (the name/value comparisons below
    // would then need equal).
    static foreach(func; testRangeFuncs)
    {{
        auto entities = parseXML(xml);
        assert(entities.front.type == EntityType.elementStart);
        assert(equal(entities.front.name, "root"));

        entities.popFront();
        assert(!entities.empty);
        assert(entities.front.type == EntityType.cdata);
        assert(equal(entities.front.text, "nothing"));

        entities.popFront();
        assert(!entities.empty);
        assert(entities.front.type == EntityType.elementEmpty);
        assert(equal(entities.front.name, "foo"));
        {
            // save, front and popFront of the attribute range must all be
            // callable from @safe code.
            auto attrRange = entities.front.attributes;
            auto attrRangeCopy = attrRange.save;
            auto first = attrRange.front;
            assert(first.name == "a");
            assert(first.value == "42");
            attrRange.popFront();
            assert(attrRange.empty);
        }
        auto entitiesCopy = entities.save;
    }}
}


// This is purely to provide a way to trigger the unittest blocks in EntityRange
// without compiling them in normally.
3579 struct EntityRangeCompileTests 3580 { 3581 @property bool empty() @safe pure nothrow @nogc { assert(0); } 3582 @property char front() @safe pure nothrow @nogc { assert(0); } 3583 void popFront() @safe pure nothrow @nogc { assert(0); } 3584 @property typeof(this) save() @safe pure nothrow @nogc { assert(0); } 3585 } 3586 3587 unittest 3588 { 3589 EntityRange!(Config.init, EntityRangeCompileTests) _entityRangeTests; 3590 } 3591 3592 3593 /++ 3594 Whether the given type is a forward range of attributes. 3595 3596 Essentially, an attribute range must be a forward range where 3597 3598 $(UL 3599 $(LI each element has the members $(D name), $(D value), and $(D pos)) 3600 $(LI $(D name) and $(D value) are forward ranges of characters) 3601 $(LI $(D name) and $(D value) have the same type) 3602 $(LI $(D pos) is a $(LREF TextPos))) 3603 3604 Normally, an attribute range would come from 3605 $(LREF EntityRange.Entity.attributes) or 3606 $(REF_ALTTEXT DOMEntity.attributes, DOMEntity.attributes, dxml, dom), but 3607 as long as a range has the correct API, it qualifies as an attribute range. 
3608 3609 See_Also: $(LREF EntityRange.Entity.Attribute)$(BR) 3610 $(LREF EntityRange.Entity.attributes)$(BR) 3611 $(REF_ALTTEXT DOMEntity.Attribute, DOMEntity.Attribute, dxml, dom)$(BR) 3612 $(REF_ALTTEXT DOMEntity.attributes, DOMEntity.attributes, dxml, dom) 3613 +/ 3614 template isAttrRange(R) 3615 { 3616 static if(isForwardRange!R && 3617 is(typeof(R.init.front.name)) && 3618 is(typeof(R.init.front.value)) && 3619 is(ReturnType!((R r) => r.front.pos) == TextPos)) 3620 { 3621 alias NameType = ReturnType!((R r) => r.front.name); 3622 alias ValueType = ReturnType!((R r) => r.front.value); 3623 3624 enum isAttrRange = is(NameType == ValueType) && 3625 isForwardRange!NameType && 3626 isSomeChar!(ElementType!NameType); 3627 } 3628 else 3629 enum isAttrRange = false; 3630 } 3631 3632 /// 3633 unittest 3634 { 3635 import std.typecons : Tuple; 3636 import dxml.dom : parseDOM; 3637 3638 alias R1 = typeof(parseXML("<root/>").front.attributes); 3639 static assert(isAttrRange!R1); 3640 3641 alias R2 = typeof(parseDOM("<root/>").children[0].attributes); 3642 static assert(isAttrRange!R2); 3643 3644 alias T = Tuple!(string, "name", string, "value", TextPos, "pos"); 3645 static assert(isAttrRange!(T[])); 3646 3647 static assert(!isAttrRange!string); 3648 } 3649 3650 unittest 3651 { 3652 import std.typecons : Tuple; 3653 { 3654 alias T = Tuple!(string, "nam", string, "value", TextPos, "pos"); 3655 static assert(!isAttrRange!(T[])); 3656 } 3657 { 3658 alias T = Tuple!(string, "name", string, "valu", TextPos, "pos"); 3659 static assert(!isAttrRange!(T[])); 3660 } 3661 { 3662 alias T = Tuple!(string, "name", string, "value", TextPos, "po"); 3663 static assert(!isAttrRange!(T[])); 3664 } 3665 { 3666 alias T = Tuple!(string, "name", wstring, "value", TextPos, "pos"); 3667 static assert(!isAttrRange!(T[])); 3668 } 3669 { 3670 alias T = Tuple!(string, "name", string, "value"); 3671 static assert(!isAttrRange!(T[])); 3672 } 3673 { 3674 alias T = Tuple!(int, "name", string, "value", 
TextPos, "pos"); 3675 static assert(!isAttrRange!(T[])); 3676 } 3677 { 3678 alias T = Tuple!(string, "name", int, "value", TextPos, "pos"); 3679 static assert(!isAttrRange!(T[])); 3680 } 3681 { 3682 alias T = Tuple!(string, "name", string, "value", int, "pos"); 3683 static assert(!isAttrRange!(T[])); 3684 } 3685 } 3686 3687 3688 /++ 3689 A helper function for processing start tag attributes. 3690 3691 It functions similarly to $(PHOBOS_REF getopt, std, getopt). It takes a 3692 range of attributes and a list of alternating strings and pointers where 3693 each string represents the name of the attribute to parse and the pointer 3694 immediately after it is assigned the value that corresponds to the attribute 3695 name (if present). If the given pointer does not point to the same type as 3696 the range of characters used in the attributes, then 3697 $(PHOBOS_REF to, std, conv) is used to convert the value to the type the 3698 pointer points to. 3699 3700 If a $(D Nullable!T*) is given rather than a $(D T*), then it will be 3701 treated the same as if it had been $(D T*). So, $(D to!T) will be used to 3702 convert the attribute value if the matching attribute name is present. The 3703 advantage of passing $(D Nullable!T*) instead of $(D T*) is that it's 3704 possible to distinguish between an attribute that wasn't present and one 3705 where it was present but was equivalent to $(D T.init). 3706 3707 Unlike $(PHOBOS_REF getopt, std, getopt), the given range is consumed 3708 rather than taking it by $(K_REF) and leaving the attributes that weren't 3709 matched in the range (since that really doesn't work with an arbitrary 3710 range as opposed to a dynamic array). However, if the second argument of 3711 getAttrs is not a $(K_STRING) but is instead an output range that accepts 3712 the element type of the range, then any attributes which aren't matched are 3713 put into the output range. 3714 3715 Params: 3716 attrRange = A range of attributes (see $(LREF isAttrRange)). 
3717 unmatched = An output range that any _unmatched attributes from the 3718 range are put into (optional argument). 3719 args = An alternating list of strings and pointers where the names 3720 represent the attribute names to get the value of, and the 3721 corresponding values get assigned to what the pointers point to. 3722 3723 Throws: $(LREF XMLParsingException) if $(PHOBOS_REF to, std, conv) fails to 3724 convert an attribute value. 3725 3726 See_Also: $(LREF isAttrRange)$(BR) 3727 $(LREF EntityRange.Entity.attributes)$(BR) 3728 $(REF_ALTTEXT DOMEntity.attributes, DOMEntity.attributes, dxml, dom) 3729 +/ 3730 void getAttrs(R, Args...)(R attrRange, Args args) 3731 if(isAttrRange!R && Args.length % 2 == 0) 3732 { 3733 mixin(_genGetAttrs(false)); 3734 } 3735 3736 /// Ditto 3737 void getAttrs(R, OR, Args...)(R attrRange, ref OR unmatched, Args args) 3738 if(isAttrRange!R && isOutputRange!(OR, ElementType!R) && Args.length % 2 == 0) 3739 { 3740 mixin(_genGetAttrs(true)); 3741 } 3742 3743 private string _genGetAttrs(bool includeUnmatched) 3744 { 3745 auto retval = 3746 ` import std.algorithm.comparison : equal; 3747 import std.conv : ConvException, to; 3748 import std.format : format; 3749 import std.typecons : Nullable; 3750 import std.utf : byChar; 3751 3752 alias Attr = ElementType!R; 3753 alias SliceOfR = ElementType!(typeof(Attr.init.name)); 3754 3755 outer: foreach(attr; attrRange) 3756 { 3757 static foreach(i, arg; args) 3758 { 3759 static if(i % 2 == 0) 3760 static assert(is(Args[i] == string), format!"Expected string for args[%s]"(i)); 3761 else 3762 { 3763 static assert(isPointer!(Args[i]), format!"Expected pointer for args[%s]"(i)); 3764 3765 if(equal(attr.name, args[i - 1].byChar())) 3766 { 3767 alias ArgType = typeof(*arg); 3768 3769 static if(isInstanceOf!(Nullable, ArgType)) 3770 alias TargetType = TemplateArgsOf!ArgType; 3771 else 3772 alias TargetType = typeof(*arg); 3773 3774 try 3775 *arg = to!TargetType(attr.value); 3776 catch(ConvException ce) 
3777 { 3778 enum fmt = "Failed to convert %s: %s"; 3779 throw new XMLParsingException(format!fmt(attr.name, ce.msg), attr.pos); 3780 } 3781 3782 continue outer; 3783 } 3784 } 3785 }`; 3786 3787 if(includeUnmatched) 3788 retval ~= "\n put(unmatched, attr);"; 3789 retval ~= "\n }"; 3790 3791 return retval; 3792 } 3793 3794 unittest 3795 { 3796 import std.array : appender; 3797 import std.exception : collectException; 3798 import std.typecons : Nullable; 3799 3800 { 3801 auto xml = `<root a="foo" b="19" c="true" d="rocks"/>`; 3802 auto range = parseXML(xml); 3803 assert(range.front.type == EntityType.elementEmpty); 3804 3805 string a; 3806 int b; 3807 bool c; 3808 3809 getAttrs(range.front.attributes, "a", &a, "b", &b, "c", &c); 3810 assert(a == "foo"); 3811 assert(b == 19); 3812 assert(c == true); 3813 } 3814 3815 // Nullable!T* accepts the same as T*. 3816 { 3817 auto xml = `<root a="foo" c="true" d="rocks"/>`; 3818 auto range = parseXML(xml); 3819 assert(range.front.type == EntityType.elementEmpty); 3820 3821 Nullable!string a; 3822 Nullable!int b; 3823 bool c; 3824 3825 getAttrs(range.front.attributes, "c", &c, "b", &b, "a", &a); 3826 assert(a == "foo"); 3827 assert(b.isNull); 3828 assert(c == true); 3829 } 3830 3831 // If an output range of attributes is provided, then the ones that 3832 // weren't matched are put in it. 
    {
        auto xml = `<root foo="42" bar="silly" d="rocks" q="t"/>`;
        auto range = parseXML(xml);
        assert(range.front.type == EntityType.elementEmpty);

        alias Attribute = typeof(range).Entity.Attribute;
        auto unmatched = appender!(Attribute[])();
        int i;
        string s;

        getAttrs(range.front.attributes, unmatched, "foo", &i, "bar", &s);
        assert(i == 42);
        assert(s == "silly");
        // Only "d" and "q" were not requested, so they are the only
        // attributes in the output range, in document order.
        assert(unmatched.data.length == 2);
        assert(unmatched.data[0] == Attribute("d", "rocks", TextPos(1, 28)));
        assert(unmatched.data[1] == Attribute("q", "t", TextPos(1, 38)));
    }

    // An XMLParsingException gets thrown if a conversion fails.
    {
        auto xml = `<root foo="bar" false="true" d="rocks"/>`;
        auto range = parseXML(xml);
        assert(range.front.type == EntityType.elementEmpty);

        int i;

        // "rocks" cannot be converted to int; the exception's position is
        // expected to be that of the d attribute.
        auto xpe = collectException!XMLParsingException(
            getAttrs(range.front.attributes, "d", &i));
        assert(xpe.pos == TextPos(1, 30));
    }

    // Test parsing attributes with CTFE.
    // Initializing an enum forces the lambda to be evaluated at compile time.
    enum dummy = (){
        auto xml = `<root a="foo" d="rocks" c="true" b="19" />`;
        auto range = parseXML(xml);
        assert(range.front.type == EntityType.elementEmpty);

        string a;
        int b;
        bool c;

        getAttrs(range.front.attributes, "a", &a, "b", &b, "c", &c);
        assert(a == "foo");
        assert(b == 19);
        assert(c == true);
        return 0;
    }();
}

// Argument lists which do not strictly alternate string, pointer must be
// rejected at compile time.
unittest
{
    auto range = parseXML("<root/>");
    auto attrs = range.front.attributes;
    int i;
    static assert(!__traits(compiles, getAttrs(attrs, "foo")));
    static assert(!__traits(compiles, getAttrs(attrs, "foo", "bar")));
    static assert(!__traits(compiles, getAttrs(attrs, "foo", "bar", &i)));
    static assert(!__traits(compiles, getAttrs(attrs, "foo", "bar", &i, &i)));
    static assert(!__traits(compiles, getAttrs(attrs, &i, "foo")));
    static assert(!__traits(compiles, getAttrs(attrs, &i, "foo", &i)));
    static assert(!__traits(compiles, getAttrs(attrs, &i, "foo", &i, "bar")));
}

// Checks that getAttrs can be called from @safe pure code.
@safe pure unittest
{
    import std.typecons : Nullable;

    static test(R)(R range, int* i, Nullable!int* j) @safe pure
    {
        getAttrs(range.front.attributes, "foo", i, "bar", j);
    }

    // <root/> has no attributes, so nothing is ever assigned through the
    // null pointers.
    test(parseXML("<root/>"), null, null);
}


/++
    Takes an $(LREF EntityRange) which is at a start tag and iterates it until
    it is at its corresponding end tag. It is an error to call skipContents when
    the current entity is not $(LREF EntityType.elementStart).

    $(TABLE
        $(TR $(TH Supported $(LREF EntityType)s:))
        $(TR $(TD $(LREF2 elementStart, EntityType)))
    )

    Returns: The range with its $(D front) now at the end tag corresponding to
             the start tag that was $(D front) when the function was called.

    Throws: $(LREF XMLParsingException) on invalid XML.
3923 +/ 3924 R skipContents(R)(R entityRange) 3925 if(isInstanceOf!(EntityRange, R)) 3926 { 3927 assert(entityRange._type == EntityType.elementStart); 3928 3929 // We don't bother calling empty, because the only way for the entityRange 3930 // to be empty would be for it to reach the end of the document, and an 3931 // XMLParsingException would be thrown if the end of the document were 3932 // reached before we reached the corresponding end tag. 3933 for(int tagDepth = 1; tagDepth != 0;) 3934 { 3935 entityRange.popFront(); 3936 immutable type = entityRange._type; 3937 if(type == EntityType.elementStart) 3938 ++tagDepth; 3939 else if(type == EntityType.elementEnd) 3940 --tagDepth; 3941 } 3942 3943 return entityRange; 3944 } 3945 3946 /// 3947 unittest 3948 { 3949 auto xml = "<root>\n" ~ 3950 " <foo>\n" ~ 3951 " <bar>\n" ~ 3952 " Some text\n" ~ 3953 " </bar>\n" ~ 3954 " </foo>\n" ~ 3955 " <!-- no comment -->\n" ~ 3956 "</root>"; 3957 3958 auto range = parseXML(xml); 3959 assert(range.front.type == EntityType.elementStart); 3960 assert(range.front.name == "root"); 3961 3962 range.popFront(); 3963 assert(range.front.type == EntityType.elementStart); 3964 assert(range.front.name == "foo"); 3965 3966 range = range.skipContents(); 3967 assert(range.front.type == EntityType.elementEnd); 3968 assert(range.front.name == "foo"); 3969 3970 range.popFront(); 3971 assert(range.front.type == EntityType.comment); 3972 assert(range.front.text == " no comment "); 3973 3974 range.popFront(); 3975 assert(range.front.type == EntityType.elementEnd); 3976 assert(range.front.name == "root"); 3977 3978 range.popFront(); 3979 assert(range.empty); 3980 } 3981 3982 3983 /++ 3984 Skips entities until the given $(LREF EntityType) is reached. 3985 3986 If multiple $(LREF EntityType)s are given, then any one of them counts as 3987 a match. 3988 3989 The current entity is skipped regardless of whether it is the given 3990 $(LREF EntityType). 
3991 3992 This is essentially a slightly optimized equivalent to 3993 3994 --- 3995 if(!range.empty()) 3996 { 3997 range.popFront(); 3998 range = range.find!((a, b) => a.type == b.type)(entityTypes); 3999 } 4000 --- 4001 4002 Returns: The given range with its $(D front) now at the first entity which 4003 matched one of the given $(LREF EntityType)s or an empty range if 4004 none were found. 4005 4006 Throws: $(LREF XMLParsingException) on invalid XML. 4007 +/ 4008 R skipToEntityType(R)(R entityRange, EntityType[] entityTypes...) 4009 if(isInstanceOf!(EntityRange, R)) 4010 { 4011 if(entityRange.empty) 4012 return entityRange; 4013 entityRange.popFront(); 4014 for(; !entityRange.empty; entityRange.popFront()) 4015 { 4016 immutable type = entityRange._type; 4017 foreach(entityType; entityTypes) 4018 { 4019 if(type == entityType) 4020 return entityRange; 4021 } 4022 } 4023 return entityRange; 4024 } 4025 4026 /// 4027 unittest 4028 { 4029 auto xml = "<root>\n" ~ 4030 " <!-- blah blah blah -->\n" ~ 4031 " <foo>nothing to say</foo>\n" ~ 4032 "</root>"; 4033 4034 auto range = parseXML(xml); 4035 assert(range.front.type == EntityType.elementStart); 4036 assert(range.front.name == "root"); 4037 4038 range = range.skipToEntityType(EntityType.elementStart, 4039 EntityType.elementEmpty); 4040 assert(range.front.type == EntityType.elementStart); 4041 assert(range.front.name == "foo"); 4042 4043 assert(range.skipToEntityType(EntityType.comment).empty); 4044 4045 // skipToEntityType will work on an empty range but will always 4046 // return an empty range. 4047 assert(range.takeNone().skipToEntityType(EntityType.comment).empty); 4048 } 4049 4050 4051 /++ 4052 Skips entities until the end tag is reached that corresponds to the start 4053 tag that is the parent of the current entity. 4054 4055 Returns: The given range with its $(D front) now at the end tag which 4056 corresponds to the parent start tag of the entity that was 4057 $(D front) when skipToParentEndTag was called. 
If the current 4058 entity does not have a parent start tag (which means that it's 4059 either the root element or a comment or PI outside of the root 4060 element), then an empty range is returned. 4061 4062 Throws: $(LREF XMLParsingException) on invalid XML. 4063 +/ 4064 R skipToParentEndTag(R)(R entityRange) 4065 if(isInstanceOf!(EntityRange, R)) 4066 { 4067 with(EntityType) final switch(entityRange._type) 4068 { 4069 case cdata: 4070 case comment: 4071 { 4072 entityRange = entityRange.skipToEntityType(elementStart, elementEnd); 4073 if(entityRange.empty || entityRange._type == elementEnd) 4074 return entityRange; 4075 goto case elementStart; 4076 } 4077 case elementStart: 4078 { 4079 while(true) 4080 { 4081 entityRange = entityRange.skipContents(); 4082 entityRange.popFront(); 4083 if(entityRange.empty || entityRange._type == elementEnd) 4084 return entityRange; 4085 if(entityRange._type == elementStart) 4086 continue; 4087 goto case comment; 4088 } 4089 assert(0); // the compiler isn't smart enough to see that this is unreachable. 
4090 } 4091 case elementEnd: 4092 case elementEmpty: 4093 case pi: 4094 case text: goto case comment; 4095 } 4096 } 4097 4098 /// 4099 unittest 4100 { 4101 auto xml = "<root>\n" ~ 4102 " <foo>\n" ~ 4103 " <!-- comment -->\n" ~ 4104 " <bar>exam</bar>\n" ~ 4105 " </foo>\n" ~ 4106 " <!-- another comment -->\n" ~ 4107 "</root>"; 4108 { 4109 auto range = parseXML(xml); 4110 assert(range.front.type == EntityType.elementStart); 4111 assert(range.front.name == "root"); 4112 4113 range.popFront(); 4114 assert(range.front.type == EntityType.elementStart); 4115 assert(range.front.name == "foo"); 4116 4117 range.popFront(); 4118 assert(range.front.type == EntityType.comment); 4119 assert(range.front.text == " comment "); 4120 4121 range = range.skipToParentEndTag(); 4122 assert(range.front.type == EntityType.elementEnd); 4123 assert(range.front.name == "foo"); 4124 4125 range = range.skipToParentEndTag(); 4126 assert(range.front.type == EntityType.elementEnd); 4127 assert(range.front.name == "root"); 4128 4129 range = range.skipToParentEndTag(); 4130 assert(range.empty); 4131 } 4132 { 4133 auto range = parseXML(xml); 4134 assert(range.front.type == EntityType.elementStart); 4135 assert(range.front.name == "root"); 4136 4137 range.popFront(); 4138 assert(range.front.type == EntityType.elementStart); 4139 assert(range.front.name == "foo"); 4140 4141 range.popFront(); 4142 assert(range.front.type == EntityType.comment); 4143 assert(range.front.text == " comment "); 4144 4145 range.popFront(); 4146 assert(range.front.type == EntityType.elementStart); 4147 assert(range.front.name == "bar"); 4148 4149 range.popFront(); 4150 assert(range.front.type == EntityType.text); 4151 assert(range.front.text == "exam"); 4152 4153 range = range.skipToParentEndTag(); 4154 assert(range.front.type == EntityType.elementEnd); 4155 assert(range.front.name == "bar"); 4156 4157 range = range.skipToParentEndTag(); 4158 assert(range.front.type == EntityType.elementEnd); 4159 assert(range.front.name == 
"foo"); 4160 4161 range.popFront(); 4162 assert(range.front.type == EntityType.comment); 4163 assert(range.front.text == " another comment "); 4164 4165 range = range.skipToParentEndTag(); 4166 assert(range.front.type == EntityType.elementEnd); 4167 assert(range.front.name == "root"); 4168 4169 assert(range.skipToParentEndTag().empty); 4170 } 4171 { 4172 auto range = parseXML("<root><foo>bar</foo></root>"); 4173 assert(range.front.type == EntityType.elementStart); 4174 assert(range.front.name == "root"); 4175 assert(range.skipToParentEndTag().empty); 4176 } 4177 } 4178 4179 unittest 4180 { 4181 import core.exception : AssertError; 4182 import std.algorithm.comparison : equal; 4183 import std.exception : enforce; 4184 import dxml.internal : testRangeFuncs; 4185 4186 static void popAndCheck(R)(ref R range, EntityType type, size_t line = __LINE__) 4187 { 4188 range.popFront(); 4189 enforce!AssertError(!range.empty, "unittest 1", __FILE__, line); 4190 enforce!AssertError(range.front.type == type, "unittest 2", __FILE__, line); 4191 } 4192 4193 static foreach(func; testRangeFuncs) 4194 {{ 4195 // cdata 4196 { 4197 auto xml = "<root>\n" ~ 4198 " <![CDATA[ cdata run ]]>\n" ~ 4199 " <nothing/>\n" ~ 4200 " <![CDATA[ cdata have its bits flipped ]]>\n" ~ 4201 " <foo></foo>\n" ~ 4202 " <![CDATA[ cdata play violin ]]>\n" ~ 4203 "</root>"; 4204 4205 auto range = parseXML(func(xml)); 4206 assert(range.front.type == EntityType.elementStart); 4207 popAndCheck(range, EntityType.cdata); 4208 assert(equal(range.front.text, " cdata run ")); 4209 { 4210 auto temp = range.save.skipToParentEndTag(); 4211 assert(temp._type == EntityType.elementEnd); 4212 assert(equal(temp.front.name, "root")); 4213 } 4214 popAndCheck(range, EntityType.elementEmpty); 4215 popAndCheck(range, EntityType.cdata); 4216 assert(equal(range.front.text, " cdata have its bits flipped ")); 4217 { 4218 auto temp = range.save.skipToParentEndTag(); 4219 assert(temp._type == EntityType.elementEnd); 4220 
assert(equal(temp.front.name, "root")); 4221 } 4222 popAndCheck(range, EntityType.elementStart); 4223 range = range.skipContents(); 4224 popAndCheck(range, EntityType.cdata); 4225 assert(equal(range.front.text, " cdata play violin ")); 4226 range = range.skipToParentEndTag(); 4227 assert(range._type == EntityType.elementEnd); 4228 assert(equal(range.front.name, "root")); 4229 } 4230 // comment 4231 { 4232 auto xml = "<!-- before -->\n" ~ 4233 "<root>\n" ~ 4234 " <!-- comment 1 -->\n" ~ 4235 " <nothing/>\n" ~ 4236 " <!-- comment 2 -->\n" ~ 4237 " <foo></foo>\n" ~ 4238 " <!-- comment 3 -->\n" ~ 4239 "</root>\n" ~ 4240 "<!-- after -->" ~ 4241 "<!-- end -->"; 4242 4243 auto text = func(xml); 4244 assert(parseXML(text.save).skipToParentEndTag().empty); 4245 { 4246 auto range = parseXML(text.save); 4247 assert(range.front.type == EntityType.comment); 4248 popAndCheck(range, EntityType.elementStart); 4249 popAndCheck(range, EntityType.comment); 4250 assert(equal(range.front.text, " comment 1 ")); 4251 { 4252 auto temp = range.save.skipToParentEndTag(); 4253 assert(temp._type == EntityType.elementEnd); 4254 assert(equal(temp.front.name, "root")); 4255 } 4256 popAndCheck(range, EntityType.elementEmpty); 4257 popAndCheck(range, EntityType.comment); 4258 assert(equal(range.front.text, " comment 2 ")); 4259 { 4260 auto temp = range.save.skipToParentEndTag(); 4261 assert(temp._type == EntityType.elementEnd); 4262 assert(equal(temp.front.name, "root")); 4263 } 4264 popAndCheck(range, EntityType.elementStart); 4265 range = range.skipContents(); 4266 popAndCheck(range, EntityType.comment); 4267 assert(equal(range.front.text, " comment 3 ")); 4268 range = range.skipToParentEndTag(); 4269 assert(range._type == EntityType.elementEnd); 4270 assert(equal(range.front.name, "root")); 4271 } 4272 { 4273 auto range = parseXML(text.save); 4274 assert(range.front.type == EntityType.comment); 4275 popAndCheck(range, EntityType.elementStart); 4276 range = range.skipContents(); 4277 
popAndCheck(range, EntityType.comment); 4278 assert(equal(range.front.text, " after ")); 4279 assert(range.save.skipToParentEndTag().empty); 4280 popAndCheck(range, EntityType.comment); 4281 assert(equal(range.front.text, " end ")); 4282 assert(range.skipToParentEndTag().empty); 4283 } 4284 } 4285 // elementStart 4286 { 4287 auto xml = "<root>\n" ~ 4288 " <a><b>foo</b></a>\n" ~ 4289 " <nothing/>\n" ~ 4290 " <c></c>\n" ~ 4291 " <d>\n" ~ 4292 " <e>\n" ~ 4293 " </e>\n" ~ 4294 " <f>\n" ~ 4295 " <g>\n" ~ 4296 " </g>\n" ~ 4297 " </f>\n" ~ 4298 " </d>\n" ~ 4299 "</root>"; 4300 4301 auto range = parseXML(func(xml)); 4302 assert(range.front.type == EntityType.elementStart); 4303 assert(equal(range.front.name, "root")); 4304 assert(range.save.skipToParentEndTag().empty); 4305 popAndCheck(range, EntityType.elementStart); 4306 assert(equal(range.front.name, "a")); 4307 { 4308 auto temp = range.save.skipToParentEndTag(); 4309 assert(temp._type == EntityType.elementEnd); 4310 assert(equal(temp.front.name, "root")); 4311 } 4312 popAndCheck(range, EntityType.elementStart); 4313 assert(equal(range.front.name, "b")); 4314 { 4315 auto temp = range.save.skipToParentEndTag(); 4316 assert(temp._type == EntityType.elementEnd); 4317 assert(equal(temp.front.name, "a")); 4318 } 4319 popAndCheck(range, EntityType.text); 4320 popAndCheck(range, EntityType.elementEnd); 4321 popAndCheck(range, EntityType.elementEnd); 4322 popAndCheck(range, EntityType.elementEmpty); 4323 popAndCheck(range, EntityType.elementStart); 4324 assert(equal(range.front.name, "c")); 4325 { 4326 auto temp = range.save.skipToParentEndTag(); 4327 assert(temp._type == EntityType.elementEnd); 4328 assert(equal(temp.front.name, "root")); 4329 } 4330 popAndCheck(range, EntityType.elementEnd); 4331 popAndCheck(range, EntityType.elementStart); 4332 assert(equal(range.front.name, "d")); 4333 popAndCheck(range, EntityType.elementStart); 4334 assert(equal(range.front.name, "e")); 4335 range = range.skipToParentEndTag(); 4336 
assert(range._type == EntityType.elementEnd); 4337 assert(equal(range.front.name, "d")); 4338 range = range.skipToParentEndTag(); 4339 assert(range._type == EntityType.elementEnd); 4340 assert(equal(range.front.name, "root")); 4341 } 4342 // elementEnd 4343 { 4344 auto xml = "<root>\n" ~ 4345 " <a><b>foo</b></a>\n" ~ 4346 " <nothing/>\n" ~ 4347 " <c></c>\n" ~ 4348 "</root>"; 4349 4350 auto range = parseXML(func(xml)); 4351 assert(range.front.type == EntityType.elementStart); 4352 popAndCheck(range, EntityType.elementStart); 4353 popAndCheck(range, EntityType.elementStart); 4354 popAndCheck(range, EntityType.text); 4355 popAndCheck(range, EntityType.elementEnd); 4356 assert(equal(range.front.name, "b")); 4357 { 4358 auto temp = range.save.skipToParentEndTag(); 4359 assert(temp._type == EntityType.elementEnd); 4360 assert(equal(temp.front.name, "a")); 4361 } 4362 popAndCheck(range, EntityType.elementEnd); 4363 assert(equal(range.front.name, "a")); 4364 { 4365 auto temp = range.save.skipToParentEndTag(); 4366 assert(temp._type == EntityType.elementEnd); 4367 assert(equal(temp.front.name, "root")); 4368 } 4369 popAndCheck(range, EntityType.elementEmpty); 4370 popAndCheck(range, EntityType.elementStart); 4371 popAndCheck(range, EntityType.elementEnd); 4372 assert(equal(range.front.name, "c")); 4373 { 4374 auto temp = range.save.skipToParentEndTag(); 4375 assert(temp._type == EntityType.elementEnd); 4376 assert(equal(temp.front.name, "root")); 4377 } 4378 popAndCheck(range, EntityType.elementEnd); 4379 assert(range.skipToParentEndTag().empty); 4380 } 4381 // elementEmpty 4382 { 4383 auto range = parseXML(func("<root/>")); 4384 assert(range.front.type == EntityType.elementEmpty); 4385 assert(range.skipToParentEndTag().empty); 4386 } 4387 { 4388 auto xml = "<root>\n" ~ 4389 " <a><b>foo</b></a>\n" ~ 4390 " <nothing/>\n" ~ 4391 " <c></c>\n" ~ 4392 " <whatever/>\n" ~ 4393 "</root>"; 4394 4395 auto range = parseXML(func(xml)); 4396 popAndCheck(range, EntityType.elementStart); 
4397 assert(range.front.type == EntityType.elementStart); 4398 range = range.skipContents(); 4399 popAndCheck(range, EntityType.elementEmpty); 4400 assert(equal(range.front.name, "nothing")); 4401 { 4402 auto temp = range.save; 4403 popAndCheck(temp, EntityType.elementStart); 4404 popAndCheck(temp, EntityType.elementEnd); 4405 popAndCheck(temp, EntityType.elementEmpty); 4406 assert(equal(temp.front.name, "whatever")); 4407 } 4408 range = range.skipToParentEndTag(); 4409 assert(range._type == EntityType.elementEnd); 4410 assert(equal(range.front.name, "root")); 4411 } 4412 // pi 4413 { 4414 auto xml = "<?Sherlock?>\n" ~ 4415 "<root>\n" ~ 4416 " <?Foo?>\n" ~ 4417 " <nothing/>\n" ~ 4418 " <?Bar?>\n" ~ 4419 " <foo></foo>\n" ~ 4420 " <?Baz?>\n" ~ 4421 "</root>\n" ~ 4422 "<?Poirot?>\n" ~ 4423 "<?Conan?>"; 4424 4425 auto range = parseXML(func(xml)); 4426 assert(range.front.type == EntityType.pi); 4427 assert(equal(range.front.name, "Sherlock")); 4428 assert(range.save.skipToParentEndTag().empty); 4429 popAndCheck(range, EntityType.elementStart); 4430 popAndCheck(range, EntityType.pi); 4431 assert(equal(range.front.name, "Foo")); 4432 { 4433 auto temp = range.save.skipToParentEndTag(); 4434 assert(temp._type == EntityType.elementEnd); 4435 assert(equal(temp.front.name, "root")); 4436 } 4437 popAndCheck(range, EntityType.elementEmpty); 4438 popAndCheck(range, EntityType.pi); 4439 assert(equal(range.front.name, "Bar")); 4440 { 4441 auto temp = range.save.skipToParentEndTag(); 4442 assert(temp._type == EntityType.elementEnd); 4443 assert(equal(temp.front.name, "root")); 4444 } 4445 popAndCheck(range, EntityType.elementStart); 4446 popAndCheck(range, EntityType.elementEnd); 4447 popAndCheck(range, EntityType.pi); 4448 assert(equal(range.front.name, "Baz")); 4449 range = range.skipToParentEndTag(); 4450 assert(range._type == EntityType.elementEnd); 4451 assert(equal(range.front.name, "root")); 4452 popAndCheck(range, EntityType.pi); 4453 assert(equal(range.front.name, 
"Poirot")); 4454 assert(range.save.skipToParentEndTag().empty); 4455 popAndCheck(range, EntityType.pi); 4456 assert(equal(range.front.name, "Conan")); 4457 assert(range.skipToParentEndTag().empty); 4458 } 4459 // text 4460 { 4461 auto xml = "<root>\n" ~ 4462 " nothing to say\n" ~ 4463 " <nothing/>\n" ~ 4464 " nothing whatsoever\n" ~ 4465 " <foo></foo>\n" ~ 4466 " but he keeps talking\n" ~ 4467 "</root>"; 4468 4469 auto range = parseXML(func(xml)); 4470 assert(range.front.type == EntityType.elementStart); 4471 popAndCheck(range, EntityType.text); 4472 assert(equal(range.front.text, "\n nothing to say\n ")); 4473 { 4474 auto temp = range.save.skipToParentEndTag(); 4475 assert(temp._type == EntityType.elementEnd); 4476 assert(equal(temp.front.name, "root")); 4477 } 4478 popAndCheck(range, EntityType.elementEmpty); 4479 popAndCheck(range, EntityType.text); 4480 assert(equal(range.front.text, "\n nothing whatsoever\n ")); 4481 { 4482 auto temp = range.save.skipToParentEndTag(); 4483 assert(temp._type == EntityType.elementEnd); 4484 assert(equal(temp.front.name, "root")); 4485 } 4486 popAndCheck(range, EntityType.elementStart); 4487 range = range.skipContents(); 4488 popAndCheck(range, EntityType.text); 4489 assert(equal(range.front.text, "\n but he keeps talking\n")); 4490 range = range.skipToParentEndTag(); 4491 assert(range._type == EntityType.elementEnd); 4492 assert(equal(range.front.name, "root")); 4493 } 4494 }} 4495 } 4496 4497 4498 /++ 4499 Treats the given string like a file path except that each directory 4500 corresponds to the name of a start tag. Note that this does $(I not) try to 4501 implement XPath as that would be quite complicated, and it really doesn't 4502 fit with a StAX parser. 4503 4504 A start tag should be thought of as a directory, with its child start tags 4505 as the directories it contains. 4506 4507 All paths should be relative. 
    $(LREF EntityRange) can only move forward
    through the document, so using an absolute path would only make sense at
    the beginning of the document. As such, absolute paths are treated as
    invalid paths.

    $(D_CODE_STRING "./") and $(D_CODE_STRING "../") are supported. Repeated
    slashes such as in $(D_CODE_STRING "foo//bar") are not supported and are
    treated as an invalid path.

    If $(D range.front.type == EntityType.elementStart), then
    $(D range._skipToPath($(D_STRING "foo"))) will search for the first child
    start tag (be it $(LREF EntityType.elementStart) or
    $(LREF EntityType.elementEmpty)) with the $(LREF2 name, EntityRange.Entity)
    $(D_CODE_STRING "foo"). That start tag must be a direct child of the current
    start tag.

    If $(D range.front.type) is any other $(LREF EntityType), then
    $(D range._skipToPath($(D_STRING "foo"))) will return an empty range,
    because no other $(LREF EntityType)s have child start tags.

    For any $(LREF EntityType), $(D range._skipToPath($(D_STRING "../foo")))
    will search for the first start tag with the
    $(LREF2 name, EntityRange.Entity) $(D_CODE_STRING "foo") at the same level
    as the current entity. If the current entity is a start tag with the name
    $(D_CODE_STRING "foo"), it will not be considered a match.

    $(D range._skipToPath($(D_STRING "./"))) is a no-op. However,
    $(D range._skipToPath($(D_STRING "../"))) will result in the empty range
    (since it doesn't target a specific start tag).

    $(D range._skipToPath($(D_STRING "foo/bar"))) is equivalent to
    $(D range._skipToPath($(D_STRING "foo"))._skipToPath($(D_STRING "bar"))),
    and $(D range._skipToPath($(D_STRING "../foo/bar"))) is equivalent to
    $(D range._skipToPath($(D_STRING "../foo"))._skipToPath($(D_STRING "bar"))).
4541 4542 Returns: The given range with its $(D front) now at the requested entity if 4543 the path is valid; otherwise, an empty range is returned. 4544 4545 Throws: $(LREF XMLParsingException) on invalid XML. 4546 +/ 4547 R skipToPath(R)(R entityRange, string path) 4548 if(isInstanceOf!(EntityRange, R)) 4549 { 4550 import std.algorithm.comparison : equal; 4551 import std.path : pathSplitter; 4552 4553 if(entityRange.empty) 4554 return entityRange; 4555 if(path.empty || path[0] == '/') 4556 return entityRange.takeNone(); 4557 4558 with(EntityType) 4559 { 4560 static if(R.config.splitEmpty == SplitEmpty.yes) 4561 EntityType[2] startOrEnd = [elementStart, elementEnd]; 4562 else 4563 EntityType[3] startOrEnd = [elementStart, elementEnd, elementEmpty]; 4564 4565 R findOnCurrLevel(string name) 4566 { 4567 if(entityRange._type == elementStart) 4568 entityRange = entityRange.skipContents(); 4569 while(true) 4570 { 4571 entityRange = entityRange.skipToEntityType(startOrEnd[]); 4572 if(entityRange.empty) 4573 return entityRange; 4574 if(entityRange._type == elementEnd) 4575 return entityRange.takeNone(); 4576 4577 if(equal(name, entityRange._name.save)) 4578 return entityRange; 4579 4580 static if(R.config.splitEmpty == SplitEmpty.no) 4581 { 4582 if(entityRange._type == elementEmpty) 4583 continue; 4584 } 4585 entityRange = entityRange.skipContents(); 4586 } 4587 } 4588 4589 for(auto pieces = path.pathSplitter(); !pieces.empty; pieces.popFront()) 4590 { 4591 if(pieces.front == ".") 4592 continue; 4593 else if(pieces.front == "..") 4594 { 4595 pieces.popFront(); 4596 if(pieces.empty) 4597 return entityRange.takeNone(); 4598 4599 while(pieces.front == "..") 4600 { 4601 pieces.popFront(); 4602 if(pieces.empty) 4603 return entityRange.takeNone(); 4604 entityRange = entityRange.skipToParentEndTag(); 4605 if(entityRange.empty) 4606 return entityRange; 4607 } 4608 4609 entityRange = findOnCurrLevel(pieces.front); 4610 if(entityRange.empty) 4611 return entityRange; 4612 } 4613 
else 4614 { 4615 if(entityRange._type != elementStart) 4616 return entityRange.takeNone(); 4617 4618 entityRange = entityRange.skipToEntityType(startOrEnd[]); 4619 assert(!entityRange.empty); 4620 if(entityRange._type == elementEnd) 4621 return entityRange.takeNone(); 4622 4623 if(!equal(pieces.front, entityRange._name.save)) 4624 { 4625 entityRange = findOnCurrLevel(pieces.front); 4626 if(entityRange.empty) 4627 return entityRange; 4628 } 4629 } 4630 } 4631 4632 return entityRange; 4633 } 4634 } 4635 4636 /// 4637 unittest 4638 { 4639 { 4640 auto xml = "<carrot>\n" ~ 4641 " <foo>\n" ~ 4642 " <bar>\n" ~ 4643 " <baz/>\n" ~ 4644 " <other/>\n" ~ 4645 " </bar>\n" ~ 4646 " </foo>\n" ~ 4647 "</carrot>"; 4648 4649 auto range = parseXML(xml); 4650 // "<carrot>" 4651 assert(range.front.type == EntityType.elementStart); 4652 assert(range.front.name == "carrot"); 4653 4654 range = range.skipToPath("foo/bar"); 4655 // " <bar> 4656 assert(!range.empty); 4657 assert(range.front.type == EntityType.elementStart); 4658 assert(range.front.name == "bar"); 4659 4660 range = range.skipToPath("baz"); 4661 // " <baz/> 4662 assert(!range.empty); 4663 assert(range.front.type == EntityType.elementEmpty); 4664 4665 // other is not a child element of baz 4666 assert(range.skipToPath("other").empty); 4667 4668 range = range.skipToPath("../other"); 4669 // " <other/>" 4670 assert(!range.empty); 4671 assert(range.front.type == EntityType.elementEmpty); 4672 } 4673 { 4674 auto xml = "<potato>\n" ~ 4675 " <foo>\n" ~ 4676 " <bar>\n "~ 4677 " </bar>\n" ~ 4678 " <crazy>\n" ~ 4679 " </crazy>\n" ~ 4680 " <fou/>\n" ~ 4681 " </foo>\n" ~ 4682 " <buzz/>\n" ~ 4683 "</potato>"; 4684 4685 auto range = parseXML(xml); 4686 // "<potato>" 4687 assert(range.front.type == EntityType.elementStart); 4688 4689 range = range.skipToPath("./"); 4690 // "<potato>" 4691 assert(!range.empty); 4692 assert(range.front.type == EntityType.elementStart); 4693 assert(range.front.name == "potato"); 4694 4695 range = 
range.skipToPath("./foo/bar"); 4696 // " <bar>" 4697 assert(!range.empty); 4698 assert(range.front.type == EntityType.elementStart); 4699 assert(range.front.name == "bar"); 4700 4701 range = range.skipToPath("../crazy"); 4702 // " <crazy>" 4703 assert(!range.empty); 4704 assert(range.front.type == EntityType.elementStart); 4705 assert(range.front.name == "crazy"); 4706 4707 // Whether popFront is called here before the call to 4708 // range.skipToPath("../fou") below, the result is the same, because 4709 // both <crazy> and </crazy> are at the same level. 4710 range.popFront(); 4711 // " </crazy>" 4712 assert(!range.empty); 4713 assert(range.front.type == EntityType.elementEnd); 4714 assert(range.front.name == "crazy"); 4715 4716 range = range.skipToPath("../fou"); 4717 // " <fou/>" 4718 assert(!range.empty); 4719 assert(range.front.type == EntityType.elementEmpty); 4720 } 4721 // Searching stops at the first matching start tag. 4722 { 4723 auto xml = "<beet>\n" ~ 4724 " <foo a='42'>\n" ~ 4725 " </foo>\n" ~ 4726 " <foo b='451'>\n" ~ 4727 " </foo>\n" ~ 4728 "</beet>"; 4729 4730 auto range = parseXML(xml); 4731 range = range.skipToPath("foo"); 4732 assert(!range.empty); 4733 assert(range.front.type == EntityType.elementStart); 4734 assert(range.front.name == "foo"); 4735 4736 { 4737 auto attrs = range.front.attributes; 4738 assert(attrs.front.name == "a"); 4739 assert(attrs.front.value == "42"); 4740 } 4741 4742 range = range.skipToPath("../foo"); 4743 assert(!range.empty); 4744 assert(range.front.type == EntityType.elementStart); 4745 assert(range.front.name == "foo"); 4746 4747 { 4748 auto attrs = range.front.attributes; 4749 assert(attrs.front.name == "b"); 4750 assert(attrs.front.value == "451"); 4751 } 4752 } 4753 // skipToPath will work on an empty range but will always return an 4754 // empty range. 
4755 { 4756 auto range = parseXML("<root/>"); 4757 assert(range.takeNone().skipToPath("nowhere").empty); 4758 } 4759 // Empty and absolute paths will also result in an empty range as will 4760 // "../" without any actual tag name on the end. 4761 { 4762 auto range = parseXML("<root/>"); 4763 assert(range.skipToPath("").empty); 4764 assert(range.skipToPath("/").empty); 4765 assert(range.skipToPath("../").empty); 4766 } 4767 // Only non-empty start tags have children; all other EntityTypes result 4768 // in an empty range unless "../" is used. 4769 { 4770 auto xml = "<!-- comment -->\n" ~ 4771 "<root>\n" ~ 4772 " <foo/>\n" ~ 4773 "</root>"; 4774 auto range = parseXML(xml); 4775 assert(range.skipToPath("root").empty); 4776 assert(range.skipToPath("foo").empty); 4777 4778 range = range.skipToPath("../root"); 4779 assert(!range.empty); 4780 assert(range.front.type == EntityType.elementStart); 4781 assert(range.front.name == "root"); 4782 } 4783 } 4784 4785 unittest 4786 { 4787 import core.exception : AssertError; 4788 import std.algorithm.comparison : equal; 4789 import std.exception : assertNotThrown, enforce; 4790 import dxml.internal : testRangeFuncs; 4791 4792 static void testPath(R)(R range, string path, EntityType type, string name, size_t line = __LINE__) 4793 { 4794 auto result = assertNotThrown!XMLParsingException(range.skipToPath(path), "unittest 1", __FILE__, line); 4795 enforce!AssertError(!result.empty, "unittest 2", __FILE__, line); 4796 enforce!AssertError(result.front.type == type, "unittest 3", __FILE__, line); 4797 enforce!AssertError(equal(result.front.name, name), "unittest 4", __FILE__, line); 4798 } 4799 4800 static void popEmpty(R)(ref R range) 4801 { 4802 range.popFront(); 4803 static if(range.config.splitEmpty == SplitEmpty.yes) 4804 range.popFront(); 4805 } 4806 4807 auto xml = "<superuser>\n" ~ 4808 " <!-- comment -->\n" ~ 4809 " <?pi?>\n" ~ 4810 " <![CDATA[cdata]]>\n" ~ 4811 " <foo/>\n" ~ 4812 " <bar/>\n" ~ 4813 " <!-- comment -->\n" ~ 4814 " 
<!-- comment -->\n" ~ 4815 " <baz/>\n" ~ 4816 " <frobozz>\n" ~ 4817 " <!-- comment -->\n" ~ 4818 " <!-- comment -->\n" ~ 4819 " <whatever/>\n" ~ 4820 " <!-- comment -->\n" ~ 4821 " <!-- comment -->\n" ~ 4822 " </frobozz>\n" ~ 4823 " <!-- comment -->\n" ~ 4824 " <!-- comment -->\n" ~ 4825 " <xyzzy/>\n" ~ 4826 "</superuser>"; 4827 4828 static foreach(func; testRangeFuncs) 4829 {{ 4830 auto text = func(xml); 4831 4832 static foreach(config; someTestConfigs) 4833 {{ 4834 static if(config.splitEmpty == SplitEmpty.yes) 4835 enum empty = EntityType.elementStart; 4836 else 4837 enum empty = EntityType.elementEmpty; 4838 4839 auto range = parseXML!config(text.save); 4840 4841 assert(range.save.skipToPath("whatever").empty); 4842 assert(range.save.skipToPath("frobozz/whateve").empty); 4843 4844 testPath(range.save, "foo", empty, "foo"); 4845 testPath(range.save, "bar", empty, "bar"); 4846 testPath(range.save, "baz", empty, "baz"); 4847 testPath(range.save, "frobozz", EntityType.elementStart, "frobozz"); 4848 testPath(range.save, "frobozz/whatever", empty, "whatever"); 4849 testPath(range.save, "xyzzy", empty, "xyzzy"); 4850 4851 range.popFront(); 4852 for(; range.front.type != empty; range.popFront()) 4853 { 4854 assert(range.save.skipToPath("foo").empty); 4855 testPath(range.save, "../foo", empty, "foo"); 4856 testPath(range.save, "../bar", empty, "bar"); 4857 testPath(range.save, "../baz", empty, "baz"); 4858 testPath(range.save, "../frobozz", EntityType.elementStart, "frobozz"); 4859 testPath(range.save, "../frobozz/whatever", empty, "whatever"); 4860 testPath(range.save, "../xyzzy", empty, "xyzzy"); 4861 } 4862 assert(equal(range.front.name, "foo")); 4863 assert(range.save.skipToPath("foo").empty); 4864 assert(range.save.skipToPath("./foo").empty); 4865 assert(range.save.skipToPath("../foo").empty); 4866 assert(range.save.skipToPath("bar").empty); 4867 assert(range.save.skipToPath("baz").empty); 4868 assert(range.save.skipToPath("frobozz").empty); 4869 
assert(range.save.skipToPath("whatever").empty); 4870 assert(range.save.skipToPath("../").empty); 4871 assert(range.save.skipToPath("../../").empty); 4872 4873 testPath(range.save, "../bar", empty, "bar"); 4874 testPath(range.save, "../baz", empty, "baz"); 4875 testPath(range.save, "../frobozz", EntityType.elementStart, "frobozz"); 4876 testPath(range.save, "../frobozz/whatever", empty, "whatever"); 4877 testPath(range.save, "../xyzzy", empty, "xyzzy"); 4878 4879 popEmpty(range); 4880 assert(range.save.skipToPath("bar").empty); 4881 testPath(range.save, "../baz", empty, "baz"); 4882 testPath(range.save, "../frobozz", EntityType.elementStart, "frobozz"); 4883 testPath(range.save, "../frobozz/whatever", empty, "whatever"); 4884 testPath(range.save, "../xyzzy", empty, "xyzzy"); 4885 4886 range.popFront(); 4887 for(; range.front.type != empty; range.popFront()) 4888 { 4889 assert(range.save.skipToPath("baz").empty); 4890 testPath(range.save, "../baz", empty, "baz"); 4891 testPath(range.save, "../frobozz", EntityType.elementStart, "frobozz"); 4892 testPath(range.save, "../frobozz/whatever", empty, "whatever"); 4893 testPath(range.save, "../xyzzy", empty, "xyzzy"); 4894 } 4895 assert(equal(range.front.name, "baz")); 4896 4897 testPath(range.save, "../frobozz", EntityType.elementStart, "frobozz"); 4898 testPath(range.save, "../frobozz/whatever", empty, "whatever"); 4899 testPath(range.save, "../xyzzy", empty, "xyzzy"); 4900 4901 popEmpty(range); 4902 assert(equal(range.front.name, "frobozz")); 4903 assert(range.save.skipToPath("wizard").empty); 4904 testPath(range.save, "whatever", empty, "whatever"); 4905 testPath(range.save, "../xyzzy", empty, "xyzzy"); 4906 4907 range.popFront(); 4908 for(; range.front.type != empty; range.popFront()) 4909 { 4910 assert(range.save.skipToPath("whatever").empty); 4911 testPath(range.save, "../whatever", empty, "whatever"); 4912 testPath(range.save, "../../xyzzy", empty, "xyzzy"); 4913 } 4914 assert(equal(range.front.name, "whatever")); 
// Test helper which creates the internal text state used by EntityRange (the
// type of EntityRange!(config, R)._text) for the given input. Only the input
// field is assigned - it is set to xmlText wrapped with byCodeUnit - and
// everything else in the returned value keeps its default-initialized state.
auto testParser(Config config = Config.init, R)(R xmlText) @trusted pure nothrow @nogc
{
    import std.utf : byCodeUnit;

    alias State = typeof(EntityRange!(config, R)._text);

    State state;
    state.input = xmlText.byCodeUnit();
    return state;
}
auto toCmpType(alias func)(string str)
{
    import std.range : takeExactly;
    import std.utf : byUTF;

    // The code unit type to convert to is whatever the parser's input yields
    // for func's range type once it has gone through takeExactly.
    alias Slice = typeof(testParser(func(str)).input.takeExactly(1));
    alias CodeUnit = immutable(ElementType!Slice);

    return str.byUTF!CodeUnit();
}

// Overload which determines the code unit type using a parser that was
// configured with the given ThrowOnEntityRef value.
auto toCmpType(alias func, ThrowOnEntityRef toer)(string str)
{
    import std.range : takeExactly;
    import std.utf : byUTF;

    alias Slice = typeof(testParser!(makeConfig(toer))(func(str)).input.takeExactly(1));
    alias CodeUnit = immutable(ElementType!Slice);

    return str.byUTF!CodeUnit();
}


// Identifies which part of the XML grammar the parser is currently in.
enum GrammarPos
{
    // Parsing has not started yet.
    documentStart,

    // document ::= prolog element Misc*
    // prolog   ::= XMLDecl? Misc* (doctypedecl Misc*)?
    // We are in the first Misc* of the prolog. What comes next is a Misc, the
    // doctypedecl, or the root element which follows the prolog.
    prologMisc1,

    // document ::= prolog element Misc*
    // prolog   ::= XMLDecl? Misc* (doctypedecl Misc*)?
    // We are in the second Misc* of the prolog (the one after the
    // doctypedecl). What comes next is a Misc or the root element which
    // follows the prolog.
    prologMisc2,

    // Only used with SplitEmpty.yes. The parser is sitting on an empty element
    // tag which is being reported as a start tag, so the next entity is an end
    // tag even though no end tag was actually parsed.
    splittingEmpty,

    // element  ::= EmptyElemTag | STag content ETag
    // content  ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    // We are at the start of content, at the first CharData?. What comes next
    // is a CharData, element, CDSect, PI, Comment, or ETag.
    // References are treated as part of the CharData and are not parsed out by
    // the EntityRange (see EntityRange.Entity.text).
    contentCharData1,

    // element  ::= EmptyElemTag | STag content ETag
    // content  ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    // We are past the first CharData?. What comes next is an element, CDSect,
    // PI, Comment, or ETag.
    // References are treated as part of the CharData and are not parsed out by
    // the EntityRange (see EntityRange.Entity.text).
    contentMid,

    // element  ::= EmptyElemTag | STag content ETag
    // content  ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    // We are at the second CharData?. What comes next is a CharData, element,
    // CDSect, PI, Comment, or ETag.
    // References are treated as part of the CharData and are not parsed out by
    // the EntityRange (see EntityRange.Entity.text).
    contentCharData2,

    // element  ::= EmptyElemTag | STag content ETag
    // content  ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
    // We are past the second CharData?. What comes next is an ETag.
    endTag,

    // document ::= prolog element Misc*
    // We are in the Misc* which ends the document. What comes next is either
    // another Misc or the end of the document.
    endMisc,

    // The end of the document (and therefore of the grammar) was reached.
    documentEnd
}


// Thin layer over skipOver for an EntityParser.Text: if text.input begins with
// needle, the needle is removed from the input and text.pos.col is advanced by
// needle.length. Otherwise, nothing is changed.
//
// Returns whether the needle was found (and stripped).
//
// The needle is assumed to contain no newlines, since only the column is
// adjusted.
bool stripStartsWith(Text)(ref Text text, string needle)
{
    import std.algorithm.searching : skipOver;
    import std.utf : byCodeUnit;

    //TODO When parsing an array of char, if any byCodeUnit and takeExactly
    // wrappers could be cleanly stripped off, skipOver should be able to
    // compare the string being parsed and the needle with ==. That may already
    // happen in some cases when text.input is a byCodeUnit result, but it
    // won't happen in every case where it ideally would. It may also be worth
    // using byUTF on the needle so that it matches the encoding of text.input,
    // or even making the needle match the encoding when it's passed in rather
    // than always being string.
    immutable matched = text.input.skipOver(needle.byCodeUnit());

    if(matched)
        text.pos.col += needle.length;

    return matched;
}
col + 7 : col); 5090 auto text = testParser(haystack); 5091 text.pos.line += 3; 5092 text.pos.col += 7; 5093 enforce!AssertError(text.stripStartsWith(needle) == startsWith, "unittest failure 4", __FILE__, line); 5094 enforce!AssertError(equalCU(text.input, remainder), "unittest failure 5", __FILE__, line); 5095 enforce!AssertError(text.pos == pos, "unittest failure 6", __FILE__, line); 5096 } 5097 } 5098 5099 static foreach(func; testRangeFuncs) 5100 { 5101 test!func("hello world", "hello", " world", true, 1, "hello".length + 1); 5102 test!func("hello world", "hello world", "", true, 1, "hello world".length + 1); 5103 test!func("hello world", "foo", "hello world", false, 1, 1); 5104 test!func("hello world", "hello sally", "hello world", false, 1, 1); 5105 test!func("hello world", "hello world ", "hello world", false, 1, 1); 5106 } 5107 } 5108 5109 @safe pure unittest 5110 { 5111 import std.algorithm.comparison : equal; 5112 import dxml.internal : testRangeFuncs; 5113 5114 static foreach(func; testRangeFuncs) 5115 {{ 5116 auto xml = func(`foo`); 5117 auto text = testParser!simpleXML(xml); 5118 assert(text.stripStartsWith("fo")); 5119 }} 5120 } 5121 5122 5123 // Strips whitespace while dealing with text.pos accordingly. Newlines are not 5124 // ignored. 5125 // Returns whether any whitespace was stripped. 
// Removes leading XML whitespace (' ', '\t', '\r', and '\n') from text.input
// and keeps text.pos in sync. Each '\n' moves text.pos to the next line via
// nextLine; the other whitespace characters advance the column.
//
// Returns true if at least one whitespace character was removed.
bool stripWS(Text)(ref Text text)
{
    enum knowsLength = hasLength!(Text.Input);

    bool sawWS = false;

    // When the input has a length, the column is fixed up once at the end
    // using the difference in length since the last newline (or since the
    // start) instead of being incremented per character.
    static if(knowsLength)
        size_t lenAtLineStart = text.input.length;

    while(!text.input.empty)
    {
        immutable c = text.input.front;

        if(c == '\n')
        {
            text.input.popFront();
            static if(knowsLength)
                lenAtLineStart = text.input.length;
            nextLine!(Text.config)(text.pos);
        }
        else if(c == ' ' || c == '\t' || c == '\r')
        {
            text.input.popFront();
            static if(!knowsLength)
                ++text.pos.col;
        }
        else
            break;

        sawWS = true;
    }

    static if(knowsLength)
        text.pos.col += lenAtLineStart - text.input.length;

    return sawWS;
}
col + 7 : col); 5185 auto text = testParser(haystack); 5186 text.pos.line += 3; 5187 text.pos.col += 7; 5188 enforce!AssertError(text.stripWS() == stripped, "unittest failure 4", __FILE__, line); 5189 enforce!AssertError(equalCU(text.input, remainder), "unittest failure 5", __FILE__, line); 5190 enforce!AssertError(text.pos == pos, "unittest failure 6", __FILE__, line); 5191 } 5192 } 5193 5194 static foreach(func; testRangeFuncs) 5195 { 5196 test!func(" \t\rhello world", "hello world", true, 1, 5); 5197 test!func(" \n \n \n \nhello world", "hello world", true, 5, 1); 5198 test!func(" \n \n \n \n hello world", "hello world", true, 5, 3); 5199 test!func("hello world", "hello world", false, 1, 1); 5200 } 5201 } 5202 5203 @safe pure unittest 5204 { 5205 import dxml.internal : testRangeFuncs; 5206 5207 static foreach(func; testRangeFuncs) 5208 {{ 5209 auto xml = func(`foo`); 5210 auto text = testParser!simpleXML(xml); 5211 assert(!text.stripWS()); 5212 }} 5213 } 5214 5215 5216 // Returns a slice (or takeExactly) of text.input up to but not including the 5217 // given needle, removing both that slice and the given needle from text.input 5218 // in the process. If the needle is not found, then an XMLParsingException is 5219 // thrown. 
// Finds the given needle in text.input and returns the portion of the input
// which precedes it (as a slice or takeExactly result). Both that portion and
// the needle itself are removed from text.input. An XMLParsingException is
// thrown if the needle is not present.
//
// This simply forwards to _takeUntil with the request to return a slice.
auto takeUntilAndDrop(string needle, bool skipQuotedText = false, Text)(ref Text text)
{
    enum wantSlice = true;
    return _takeUntil!(wantSlice, needle, skipQuotedText, Text)(text);
}
col + 7 : col); 5247 auto text = testParser(haystack); 5248 text.pos.line += 3; 5249 text.pos.col += 7; 5250 enforce!AssertError(equal(text.takeUntilAndDrop!(needle, sqt)(), adjExpected), 5251 "unittest failure 4", __FILE__, line); 5252 enforce!AssertError(equal(text.input, remainder), "unittest failure 5", __FILE__, line); 5253 enforce!AssertError(text.pos == pos, "unittest failure 6", __FILE__, line); 5254 } 5255 } 5256 5257 static void testFail(alias func, string needle, bool sqt) 5258 (string origHaystack, int row, int col, size_t line = __LINE__) 5259 { 5260 auto haystack = func(origHaystack); 5261 { 5262 auto text = testParser(haystack.save); 5263 auto e = collectException!XMLParsingException(text.takeUntilAndDrop!(needle, sqt)()); 5264 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 5265 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 5266 } 5267 { 5268 auto pos = TextPos(row + 3, row == 1 ? col + 7 : col); 5269 auto text = testParser(haystack); 5270 text.pos.line += 3; 5271 text.pos.col += 7; 5272 auto e = collectException!XMLParsingException(text.takeUntilAndDrop!(needle, sqt)()); 5273 enforce!AssertError(e !is null, "unittest failure 3", __FILE__, line); 5274 enforce!AssertError(e.pos == pos, "unittest failure 4", __FILE__, line); 5275 } 5276 } 5277 5278 static foreach(func; testRangeFuncs) 5279 { 5280 static foreach(sqt; [false, true]) 5281 { 5282 { 5283 auto haystack = "hello world"; 5284 enum needle = "world"; 5285 5286 static foreach(i; 1 .. needle.length) 5287 test!(func, needle[0 .. i], sqt)(haystack, "hello ", needle[i .. $], 1, 7 + i); 5288 } 5289 5290 test!(func, "l", sqt)("lello world", "", "ello world", 1, 2); 5291 test!(func, "ll", sqt)("lello world", "le", "o world", 1, 5); 5292 test!(func, "le", sqt)("llello world", "l", "llo world", 1, 4); 5293 { 5294 enum needle = "great"; 5295 enum expected = "プログラミング in D is "; 5296 static foreach(i; 1 .. 
needle.length) 5297 { 5298 test!(func, needle[0 .. i], sqt)("プログラミング in D is great indeed", expected, 5299 "great indeed"[i .. $], 1, codeLen!(func, expected) + i + 1); 5300 } 5301 } 5302 static foreach(haystack; ["", "a", "hello", "ディラン"]) 5303 testFail!(func, "x", sqt)(haystack, 1, 1); 5304 static foreach(haystack; ["", "l", "lte", "world", "nomatch"]) 5305 testFail!(func, "le", sqt)(haystack, 1, 1); 5306 static foreach(haystack; ["", "w", "we", "wew", "bwe", "we b", "hello we go", "nomatch"]) 5307 testFail!(func, "web", sqt)(haystack, 1, 1); 5308 } 5309 5310 test!(func, "*", false)(`hello '*' "*" * world`, `hello '`, `' "*" * world`, 1, 9); 5311 test!(func, "*", false)(`hello '"*' * world`, `hello '"`, `' * world`, 1, 10); 5312 test!(func, "*", false)(`hello "'*" * world`, `hello "'`, `" * world`, 1, 10); 5313 test!(func, "*", false)(`hello ''' * world`, `hello ''' `, ` world`, 1, 12); 5314 test!(func, "*", false)(`hello """ * world`, `hello """ `, ` world`, 1, 12); 5315 testFail!(func, "*", false)("foo\n\n ' \n\nbar", 1, 1); 5316 testFail!(func, "*", false)(`ディラン " `, 1, 1); 5317 5318 test!(func, "*", true)(`hello '*' "*" * world`, `hello '*' "*" `, ` world`, 1, 16); 5319 test!(func, "*", true)(`hello '"*' * world`, `hello '"*' `, ` world`, 1, 13); 5320 test!(func, "*", true)(`hello "'*" * world`, `hello "'*" `, ` world`, 1, 13); 5321 testFail!(func, "*", true)(`hello ''' * world`, 1, 9); 5322 testFail!(func, "*", true)(`hello """ * world`, 1, 9); 5323 testFail!(func, "*", true)("foo\n\n ' \n\nbar", 3, 4); 5324 testFail!(func, "*", true)(`ディラン " `, 1, codeLen!(func, `ディラン "`)); 5325 5326 test!(func, "*", true)(`hello '' "" * world`, `hello '' "" `, ` world`, 1, 14); 5327 test!(func, "*", true)("foo '\n \n \n' bar*", "foo '\n \n \n' bar", "", 4, 7); 5328 } 5329 } 5330 5331 @safe pure unittest 5332 { 5333 import std.algorithm.comparison : equal; 5334 import dxml.internal : testRangeFuncs; 5335 5336 static foreach(func; testRangeFuncs) 5337 {{ 5338 auto xml = 
// Same as takeUntilAndDrop except that nothing is returned. It's meant for the
// cases where the config says that whatever is being parsed should be skipped.
//
// This simply forwards to _takeUntil with the request not to return a slice.
void skipUntilAndDrop(string needle, bool skipQuotedText = false, Text)(ref Text text)
{
    enum wantSlice = false;
    _takeUntil!(wantSlice, needle, skipQuotedText, Text)(text);
}
col + 7 : col); 5371 auto text = testParser(haystack); 5372 text.pos.line += 3; 5373 text.pos.col += 7; 5374 assertNotThrown!XMLParsingException(text.skipUntilAndDrop!(needle, sqt)(), "unittest failure 4", 5375 __FILE__, line); 5376 enforce!AssertError(equal(text.input, remainder), "unittest failure 5", __FILE__, line); 5377 enforce!AssertError(text.pos == pos, "unittest failure 6", __FILE__, line); 5378 } 5379 } 5380 5381 static void testFail(alias func, string needle, bool sqt) 5382 (string origHaystack, int row, int col, size_t line = __LINE__) 5383 { 5384 auto haystack = func(origHaystack); 5385 { 5386 auto text = testParser(haystack.save); 5387 auto e = collectException!XMLParsingException(text.skipUntilAndDrop!(needle, sqt)()); 5388 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 5389 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 5390 } 5391 { 5392 auto pos = TextPos(row + 3, row == 1 ? col + 7 : col); 5393 auto text = testParser(haystack); 5394 text.pos.line += 3; 5395 text.pos.col += 7; 5396 auto e = collectException!XMLParsingException(text.skipUntilAndDrop!(needle, sqt)()); 5397 enforce!AssertError(e !is null, "unittest failure 3", __FILE__, line); 5398 enforce!AssertError(e.pos == pos, "unittest failure 4", __FILE__, line); 5399 } 5400 } 5401 5402 static foreach(func; testRangeFuncs) 5403 { 5404 static foreach(sqt; [false, true]) 5405 { 5406 { 5407 enum needle = "world"; 5408 static foreach(i; 1 .. needle.length) 5409 test!(func, needle[0 .. i], sqt)("hello world", needle[i .. $], 1, 7 + i); 5410 } 5411 5412 test!(func, "l", sqt)("lello world", "ello world", 1, 2); 5413 test!(func, "ll", sqt)("lello world", "o world", 1, 5); 5414 test!(func, "le", sqt)("llello world", "llo world", 1, 4); 5415 5416 { 5417 enum needle = "great"; 5418 static foreach(i; 1 .. needle.length) 5419 { 5420 test!(func, needle[0 .. i], sqt)("プログラミング in D is great indeed", "great indeed"[i .. 
// Shared implementation of takeUntilAndDrop and skipUntilAndDrop.
//
// Scans text.input for needle. On success, everything up to and including the
// needle has been removed from text.input, and text.pos has been advanced
// accordingly. If the needle is not found, an XMLParsingException is thrown
// using the position that text had when this function was called.
//
// Template parameters:
//     retSlice       = if true, takeExactly(orig.input, takeLen) is returned,
//                      i.e. the part of the original input which preceded the
//                      needle; if false, nothing is returned.
//     needle         = the text to search for. It must not contain whitespace
//                      (enforced by the static assert below).
//     skipQuotedText = if true, text between matching ' or " characters is
//                      not searched, and an XMLParsingException (positioned at
//                      the opening quote) is thrown if a quote is never closed.
auto _takeUntil(bool retSlice, string needle, bool skipQuotedText, Text)(ref Text text)
{
    import std.algorithm : find;
    import std.ascii : isWhite;
    import std.range : takeExactly;

    static assert(needle.find!isWhite().empty);

    // Copy of the state on entry. It provides the input for the returned slice
    // and the position for the "Failed to find" exception.
    auto orig = text.save;
    bool found = false;
    // Number of code units consumed which are not part of the matched needle.
    size_t takeLen = 0;
    // Value of takeLen just after the most recent newline, so that
    // takeLen - lineStart is the number of code units consumed on the current
    // line.
    size_t lineStart = 0;

    // Counts the newline and moves text.pos to the next line. The caller is
    // responsible for popping the newline off of the input.
    void processNewline()
    {
        ++takeLen;
        nextLine!(Text.config)(text.pos);
        lineStart = takeLen;
    }

    loop: while(!text.input.empty)
    {
        switch(text.input.front)
        {
            case cast(ElementType!(Text.Input))needle[0]:
            {
                static if(needle.length == 1)
                {
                    found = true;
                    text.input.popFront();
                    break loop;
                }
                else static if(needle.length == 2)
                {
                    text.input.popFront();
                    if(!text.input.empty && text.input.front == needle[1])
                    {
                        found = true;
                        text.input.popFront();
                        break loop;
                    }
                    // Not a match. The first character was already popped, so
                    // count it and restart the loop without reaching the
                    // popFront at the bottom.
                    ++takeLen;
                    continue;
                }
                else
                {
                    text.input.popFront();
                    // Saved so that the input can be rewound to just after the
                    // first character if the rest of the needle doesn't match.
                    auto saved = text.input.save;
                    foreach(i, c; needle[1 .. $])
                    {
                        if(text.input.empty)
                        {
                            // Ran out of input mid-match: count the first
                            // character plus the i characters which matched
                            // and leave the loop with found still false.
                            takeLen += i + 1;
                            break loop;
                        }
                        if(text.input.front != c)
                        {
                            text.input = saved;
                            ++takeLen;
                            continue loop;
                        }
                        text.input.popFront();
                    }
                    found = true;
                    break loop;
                }
            }
            static if(skipQuotedText)
            {
                static foreach(quote; ['\'', '"'])
                {
                    case quote:
                    {
                        // text.pos.col is only updated once scanning is done,
                        // so the column of the opening quote has to be
                        // computed here for the error message.
                        auto quotePos = text.pos;
                        quotePos.col += takeLen - lineStart;
                        ++takeLen;
                        while(true)
                        {
                            // On the first iteration, this pops the opening
                            // quote; after that, it pops the character which
                            // was just examined.
                            text.input.popFront();
                            if(text.input.empty)
                                throw new XMLParsingException("Failed to find matching quote", quotePos);
                            switch(text.input.front)
                            {
                                case quote:
                                {
                                    ++takeLen;
                                    text.input.popFront();
                                    continue loop;
                                }
                                case '\n':
                                {
                                    processNewline();
                                    break;
                                }
                                default:
                                {
                                    ++takeLen;
                                    break;
                                }
                            }
                        }
                        assert(0); // the compiler isn't smart enough to see that this is unreachable.
                    }
                }
            }
            case '\n':
            {
                processNewline();
                break;
            }
            default:
            {
                ++takeLen;
                break;
            }
        }

        text.input.popFront();
    }

    // Note that the column is adjusted before the check for failure, so
    // text.pos has been modified even when the exception below is thrown (the
    // exception itself uses the position from before the search).
    text.pos.col += takeLen - lineStart + needle.length;

    if(!found)
        throw new XMLParsingException("Failed to find: " ~ needle, orig.pos);

    static if(retSlice)
        return takeExactly(orig.input, takeLen);
}


// Okay, this name kind of sucks, because it's too close to skipUntilAndDrop,
// but I'd rather do this than be passing template arguments to choose between
// behaviors - especially when the logic is so different. It skips until it
// reaches one of the delimiter characters. If it finds one of them, then the
// first character in the input is the delimiter that was found, and if it
// doesn't find any of them, then it throws an XMLParsingException whose
// position is wherever text.pos was when the input ran out.
//
// Every delimiter must be a char and must not be a character for which
// isSpace is true (both are checked at compile time).
template skipToOneOf(delims...)
{
    static foreach(delim; delims)
    {
        static assert(is(typeof(delim) == char));
        static assert(!isSpace(delim));
    }

    void skipToOneOf(Text)(ref Text text)
    {
        while(!text.input.empty)
        {
            switch(text.input.front)
            {
                // A delimiter is left at the front of the input for the caller.
                static foreach(delim; delims)
                    case delim: return;
                case '\n':
                {
                    nextLine!(Text.config)(text.pos);
                    text.input.popFront();
                    break;
                }
                default:
                {
                    popFrontAndIncCol(text);
                    break;
                }
            }
        }
        throw new XMLParsingException("Prematurely reached end of document", text.pos);
    }
}
= testParser(haystack.save); 5650 assertNotThrown!XMLParsingException(text.skipToOneOf!delims(), "unittest 1", __FILE__, line); 5651 enforce!AssertError(equal(text.input, remainder), "unittest failure 2", __FILE__, line); 5652 enforce!AssertError(text.pos == TextPos(row, col), "unittest failure 3", __FILE__, line); 5653 } 5654 { 5655 auto pos = TextPos(row + 3, row == 1 ? col + 7 : col); 5656 auto text = testParser(haystack); 5657 text.pos.line += 3; 5658 text.pos.col += 7; 5659 assertNotThrown!XMLParsingException(text.skipToOneOf!delims(), "unittest 4", __FILE__, line); 5660 enforce!AssertError(equal(text.input, remainder), "unittest failure 5", __FILE__, line); 5661 enforce!AssertError(text.pos == pos, "unittest failure 6", __FILE__, line); 5662 } 5663 } 5664 5665 static void testFail(alias func, delims...)(string origHaystack, int row, int col, size_t line = __LINE__) 5666 { 5667 auto haystack = func(origHaystack); 5668 { 5669 auto text = testParser(haystack.save); 5670 auto e = collectException!XMLParsingException(text.skipToOneOf!delims()); 5671 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 5672 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 5673 } 5674 { 5675 auto pos = TextPos(row + 3, row == 1 ? 
// Expects the front of the input to be text which is enclosed in either single
// quotes or double quotes. The text between the quotes is returned as a slice
// of the input, and the input is left one code unit past the closing quote.
//
// An XMLParsingException is thrown if the first character is not a quote
// character; checkNotEmpty and takeUntilAndDrop handle the cases of empty
// input and of the closing quote not being found.
auto takeEnquotedText(Text)(ref Text text)
{
    checkNotEmpty(text);
    immutable opening = text.input.front;

    // takeUntilAndDrop takes its needle at compile time, so each kind of quote
    // gets its own branch with the matching closing quote as the needle.
    if(opening == '"')
    {
        popFrontAndIncCol(text);
        return takeUntilAndDrop!`"`(text);
    }

    if(opening == '\'')
    {
        popFrontAndIncCol(text);
        return takeUntilAndDrop!`'`(text);
    }

    throw new XMLParsingException("Expected quoted text", text.pos);
}
// Takes a name (per the XML Name grammar rule) off the front of the input and
// returns it.
//
// Scanning stops at the first XML whitespace character or at the first code
// unit equal to one of the compile-time delimiters, whichever comes first (or
// at the end of the input). The character that stopped the scan is not part of
// the returned name and stays at the front of the input.
//
// The input must not be empty (this is asserted). An XMLParsingException is
// thrown if the first character is not a valid name start character or if any
// later character before the terminator is not a valid name character; its
// position is that of the offending character.
template takeName(delims...)
{
    // Delimiters are compared against individual code units, so they must be
    // chars, and whitespace always terminates a name, so it makes no sense as
    // an explicit delimiter.
    static foreach(delim; delims)
    {
        static assert(is(typeof(delim) == char), delim);
        static assert(!isSpace(delim));
    }

    auto takeName(Text)(ref Text text)
    {
        import std.format : format;
        import std.range : takeExactly;
        import std.utf : decodeFront, UseReplacementDchar;
        import dxml.internal : formatInvalidCharMsg, isNameStartChar, isNameChar;

        assert(!text.input.empty);

        // The name is returned as a slice of this saved copy; nameLen counts
        // the code units consumed so far.
        auto start = text.input.save;
        size_t nameLen;

        {
            immutable first = text.input.decodeFront!(UseReplacementDchar.yes)(nameLen);
            if(!isNameStartChar(first))
            {
                throw new XMLParsingException(formatInvalidCharMsg!"Name contains invalid character: %s"(first),
                                              text.pos);
            }
        }

        scan: while(!text.input.empty)
        {
            immutable unit = text.input.front;

            if(isSpace(unit))
                break;

            static foreach(delim; delims)
            {
                if(unit == delim)
                    break scan;
            }

            size_t width;
            immutable decoded = text.input.decodeFront!(UseReplacementDchar.yes)(width);
            if(!isNameChar(decoded))
            {
                // Report the position of the bad character rather than the
                // start of the name.
                text.pos.col += nameLen;
                throw new XMLParsingException(formatInvalidCharMsg!"Name contains invalid character: %s"(decoded),
                                              text.pos);
            }
            nameLen += width;
        }

        text.pos.col += nameLen;
        return takeExactly(start, nameLen);
    }
}
5872 auto adjExpected = expected.toCmpType!func(); 5873 { 5874 auto text = testParser(haystack.save); 5875 enforce!AssertError(equal(text.takeName!delim(), adjExpected.save), 5876 "unittest failure 1", __FILE__, line); 5877 enforce!AssertError(equal(text.input, remainder), "unittest failure 2", __FILE__, line); 5878 enforce!AssertError(text.pos == TextPos(row, col), "unittest failure 3", __FILE__, line); 5879 } 5880 { 5881 auto pos = TextPos(row + 3, row == 1 ? col + 7 : col); 5882 auto text = testParser(haystack); 5883 text.pos.line += 3; 5884 text.pos.col += 7; 5885 enforce!AssertError(equal(text.takeName!delim(), adjExpected), 5886 "unittest failure 4", __FILE__, line); 5887 enforce!AssertError(equal(text.input, remainder), "unittest failure 5", __FILE__, line); 5888 enforce!AssertError(text.pos == pos, "unittest failure 6", __FILE__, line); 5889 } 5890 } 5891 5892 static void testFail(alias func, delim...)(string origHaystack, int row, int col, size_t line = __LINE__) 5893 { 5894 auto haystack = func(origHaystack); 5895 { 5896 auto text = testParser(haystack.save); 5897 auto e = collectException!XMLParsingException(text.takeName!delim()); 5898 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 5899 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 5900 } 5901 { 5902 auto pos = TextPos(row + 3, row == 1 ? 
col + 7 : col); 5903 auto text = testParser(haystack); 5904 text.pos.line += 3; 5905 text.pos.col += 7; 5906 auto e = collectException!XMLParsingException(text.takeName!delim()); 5907 enforce!AssertError(e !is null, "unittest failure 3", __FILE__, line); 5908 enforce!AssertError(e.pos == pos, "unittest failure 4", __FILE__, line); 5909 } 5910 } 5911 5912 static foreach(func; testRangeFuncs) 5913 { 5914 static foreach(str; ["hello", "プログラミング", "h_:llo-.42", "_.", "_-", "_42"]) 5915 {{ 5916 enum len = codeLen!(func, str); 5917 5918 static foreach(remainder; ["", " ", "\t", "\r", "\n", " foo", "\tfoo", "\rfoo", "\nfoo", " foo \n \r "]) 5919 {{ 5920 enum strRem = str ~ remainder; 5921 enum delimRem = '>' ~ remainder; 5922 enum hay = str ~ delimRem; 5923 test!func(strRem, str, remainder, 1, len + 1); 5924 test!(func, '=')(strRem, str, remainder, 1, len + 1); 5925 test!(func, '>', '|')(hay, str, delimRem, 1, len + 1); 5926 test!(func, '|', '>')(hay, str, delimRem, 1, len + 1); 5927 }} 5928 }} 5929 5930 static foreach(t; [tuple(" ", 1, 1), tuple("<", 1, 1), tuple("foo!", 1, 4), tuple("foo!<", 1, 4)]) 5931 {{ 5932 testFail!func(t[0], t[1], t[2]); 5933 testFail!func(t[0] ~ '>', t[1], t[2]); 5934 testFail!(func, '?')(t[0], t[1], t[2]); 5935 testFail!(func, '=')(t[0] ~ '=', t[1], t[2]); 5936 }} 5937 5938 testFail!(func, '>')(">", 1, 1); 5939 testFail!(func, '?')("?", 1, 1); 5940 testFail!(func, '?')("プログ&ラミング", 1, codeLen!(func, "プログ&")); 5941 5942 static foreach(t; [tuple("42", 1, 1), tuple(".", 1, 1), tuple(".a", 1, 1)]) 5943 { 5944 testFail!func(t[0], t[1], t[2]); 5945 testFail!(func, '>')(t[0], t[1], t[2]); 5946 } 5947 } 5948 } 5949 5950 @safe pure unittest 5951 { 5952 import std.algorithm.comparison : equal; 5953 import dxml.internal : testRangeFuncs; 5954 5955 static foreach(func; testRangeFuncs) 5956 {{ 5957 auto xml = func(`foo`); 5958 auto text = testParser!simpleXML(xml); 5959 assert(equal(text.takeName(), "foo")); 5960 }} 5961 } 5962 5963 5964 // This removes an 
// Takes an attribute value off the front of the input, partially validates it,
// and returns it without the surrounding quotes.
//
// The front of the input must be a single or double quote; the value runs to
// the matching quote, and the input is left just past that closing quote.
//
// What is validated: the value may not contain '<', every '&' must begin a
// syntactically valid character reference or entity reference (only the five
// predefined entity references are accepted when Text.config.throwOnEntityRef
// is ThrowOnEntityRef.yes), and every character must be a legal XML character.
// What is NOT validated: whether the number in a character reference denotes a
// valid Unicode character. That isn't required for well-formedness, and
// dxml.util.parseCharRef checks it if it is used.
//
// Throws: XMLParsingException if the input does not start with a quote, if the
// value is unterminated (reported at the opening quote), or if any of the
// checks above fail (reported at the offending character).
//
// Position tracking: text.pos.col is only brought up to date when a newline is
// hit, when an error is reported, or at the end. In between, the column of the
// front of the input is text.pos.col + (takeLen - lineStart), where takeLen is
// the number of code units consumed since the opening quote and lineStart is
// the value of takeLen just after the most recent newline.
auto takeAttValue(Text)(ref Text text)
{
    // AttValue    ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
    // Reference   ::= EntityRef | CharRef
    // EntityRef   ::= '&' Name ';'
    // PEReference ::= '%' Name ';'

    checkNotEmpty(text);
    immutable quote = text.input.front;
    immutable quotePos = text.pos;

    if(quote != '"' && quote != '\'')
        throw new XMLParsingException("Expected quoted text", text.pos);

    popFrontAndIncCol(text);
    size_t lineStart = 0;
    auto orig = text.input.save;
    size_t takeLen;
    loop: while(true)
    {
        if(text.input.empty)
            throw new XMLParsingException("Unterminated attribute value", quotePos);
        switch(text.input.front)
        {
            case '"':
            {
                if(quote == '"')
                {
                    text.input.popFront();
                    goto done;
                }
                // A double quote inside a single-quoted value is ordinary text.
                goto default;
            }
            case '\'':
            {
                if(quote == '\'')
                {
                    text.input.popFront();
                    goto done;
                }
                // A single quote inside a double-quoted value is ordinary text.
                goto default;
            }
            case '&':
            {
                // Character references
                {
                    import dxml.util : parseCharRef;
                    auto temp = text.input.save;
                    auto charRef = parseCharRef(temp);
                    if(!charRef.isNull)
                    {
                        static if(hasLength!(Text.Input))
                        {
                            takeLen += text.input.length - temp.length;
                            text.input = temp;
                        }
                        else
                        {
                            // Without length, count the code units of the
                            // reference by walking to its terminating ';'.
                            while(text.input.front != ';')
                            {
                                ++takeLen;
                                text.input.popFront();
                            }
                            ++takeLen;
                            text.input.popFront();
                        }
                        continue;
                    }
                }

                // Column offset of the '&' in case an error has to be reported.
                immutable ampLen = takeLen - lineStart;
                ++takeLen;
                text.input.popFront();

                // Std Entity References
                static if(Text.config.throwOnEntityRef == ThrowOnEntityRef.yes)
                {
                    import std.algorithm.searching : startsWith;

                    static foreach(entRef; ["amp;", "apos;", "quot;", "lt;", "gt;"])
                    {
                        if(text.input.save.startsWith(entRef))
                        {
                            takeLen += entRef.length;
                            text.input.popFrontN(entRef.length);
                            continue loop;
                        }
                    }

                    text.pos.col += ampLen;
                    throw new XMLParsingException("& is only legal in an attribute value as part of a " ~
                                                  "reference, and this parser only supports entity " ~
                                                  "references if they're predefined by the spec. This is not " ~
                                                  "a valid character reference or one of the predefined " ~
                                                  "entity references.", text.pos);
                }
                // All Entity References
                else
                {
                    import std.utf : decodeFront, UseReplacementDchar;
                    import dxml.internal : isNameStartChar, isNameChar;

                    if(text.input.empty || text.input.front == quote)
                        goto failedEntityRef;

                    {
                        size_t numCodeUnits;
                        immutable decodedC = text.input.decodeFront!(UseReplacementDchar.yes)(numCodeUnits);
                        if(!isNameStartChar(decodedC))
                            goto failedEntityRef;
                        takeLen += numCodeUnits;
                    }

                    while(true)
                    {
                        if(text.input.empty)
                            goto failedEntityRef;
                        immutable c = text.input.front;
                        if(c == ';')
                        {
                            // The ';' itself is popped by the popFront at the
                            // bottom of the outer loop.
                            ++takeLen;
                            break;
                        }
                        size_t numCodeUnits;
                        immutable decodedC = text.input.decodeFront!(UseReplacementDchar.yes)(numCodeUnits);
                        if(!isNameChar(decodedC))
                            goto failedEntityRef;
                        takeLen += numCodeUnits;
                    }
                    break;

                    failedEntityRef:
                    text.pos.col += ampLen;
                    throw new XMLParsingException("& is only legal in an attribute value as part of a " ~
                                                  "character or entity reference, and this is not a valid " ~
                                                  "character or entity reference.", text.pos);
                }
            }
            case '<':
            {
                text.pos.col += takeLen - lineStart;
                throw new XMLParsingException("< is not legal in an attribute value", text.pos);
            }
            case '\n':
            {
                ++takeLen;
                nextLine!(Text.config)(text.pos);
                lineStart = takeLen;
                break;
            }
            default:
            {
                import std.ascii : isASCII;
                import std.format : format;
                import dxml.internal : isXMLChar;

                immutable c = text.input.front;
                if(isASCII(c))
                {
                    if(!isXMLChar(c))
                    {
                        // text.pos.col has not been kept up to date while
                        // scanning, so move it to the offending character
                        // before reporting it.
                        text.pos.col += takeLen - lineStart;
                        throw new XMLParsingException(format!"Character is not legal in an XML File: 0x%0x"(c),
                                                      text.pos);
                    }
                    ++takeLen;
                    break;
                }
                import std.utf : decodeFront, UseReplacementDchar, UTFException;
                // Annoyingly, letting decodeFront throw is the easier way to handle this, since the
                // replacement character is considered valid XML, and if we decoded using it, then
                // all of the invalid Unicode characters would come out as the replacement character
                // and then be treated as valid instead of being caught, which isn't all bad, but
                // the spec requires that they be treated as invalid instead of playing nice and
                // using the replacement character.
                try
                {
                    size_t numCodeUnits;
                    immutable decodedC = text.input.decodeFront!(UseReplacementDchar.no)(numCodeUnits);
                    if(!isXMLChar(decodedC))
                    {
                        enum fmt = "Character is not legal in an XML File: 0x%0x";
                        // Same column adjustment as for the ASCII case. This
                        // exception is not a UTFException, so the catch below
                        // does not adjust the column a second time.
                        text.pos.col += takeLen - lineStart;
                        throw new XMLParsingException(format!fmt(decodedC), text.pos);
                    }
                    takeLen += numCodeUnits;
                }
                catch(UTFException e)
                {
                    text.pos.col += takeLen - lineStart;
                    throw new XMLParsingException("Invalid Unicode character", text.pos);
                }
                // decodeFront already consumed the character, so skip the
                // popFront at the bottom of the loop.
                continue;
            }
        }
        text.input.popFront();
    }

    done:
    {
        import std.range : takeExactly;
        // + 1 accounts for the closing quote, which is not part of takeLen.
        text.pos.col += takeLen - lineStart + 1;
        return takeExactly(orig, takeLen);
    }
}
remainder), "unittest failure 2", __FILE__, line); 6196 enforce!AssertError(text.pos == TextPos(row, col), "unittest failure 3", __FILE__, line); 6197 } 6198 { 6199 auto pos = TextPos(row + 3, row == 1 ? col + 7 : col); 6200 auto text = testParser!(makeConfig(toer))(haystack); 6201 text.pos.line += 3; 6202 text.pos.col += 7; 6203 enforce!AssertError(equal(text.takeAttValue(), adjExpected), 6204 "unittest failure 4", __FILE__, line); 6205 enforce!AssertError(equal(text.input, remainder), "unittest failure 5", __FILE__, line); 6206 enforce!AssertError(text.pos == pos, "unittest failure 6", __FILE__, line); 6207 } 6208 } 6209 6210 static void testFail(alias func, ThrowOnEntityRef toer)(string origHaystack, 6211 int row, int col, size_t line = __LINE__) 6212 { 6213 auto haystack = func(origHaystack); 6214 { 6215 auto text = testParser!(makeConfig(toer))(haystack.save); 6216 auto e = collectException!XMLParsingException(text.takeAttValue()); 6217 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 6218 enforce!AssertError(e.pos == TextPos(row, col), "unittest failure 2", __FILE__, line); 6219 } 6220 { 6221 auto pos = TextPos(row + 3, row == 1 ? 
col + 7 : col); 6222 auto text = testParser!(makeConfig(toer))(haystack); 6223 text.pos.line += 3; 6224 text.pos.col += 7; 6225 auto e = collectException!XMLParsingException(text.takeAttValue()); 6226 enforce!AssertError(e !is null, "unittest failure 1", __FILE__, line); 6227 enforce!AssertError(e.pos == pos, "unittest failure 2", __FILE__, line); 6228 } 6229 } 6230 6231 static foreach(i, func; testRangeFuncs) 6232 { 6233 static foreach(toer; [ThrowOnEntityRef.yes, ThrowOnEntityRef.no]) 6234 { 6235 test!(func, toer)(`""`, "", "", 1, 3); 6236 test!(func, toer)(`"J"`, "J", "", 1, 4); 6237 test!(func, toer)(`"foo"`, "foo", "", 1, 6); 6238 test!(func, toer)(`"プログラミング"`, "プログラミング", "", 1, codeLen!(func, "プログラミング") + 3); 6239 test!(func, toer)(`"foo"bar`, "foo", "bar", 1, 6); 6240 test!(func, toer)(`"プログラミング" after`, "プログラミング", " after", 1, codeLen!(func, "プログラミング") + 3); 6241 6242 test!(func, toer)(`''`, "", "", 1, 3); 6243 test!(func, toer)(`'J'`, "J", "", 1, 4); 6244 test!(func, toer)(`'foo'`, "foo", "", 1, 6); 6245 test!(func, toer)(`'プログラミング'`, "プログラミング", "", 1, codeLen!(func, "プログラミング") + 3); 6246 test!(func, toer)(`'foo'bar`, "foo", "bar", 1, 6); 6247 test!(func, toer)(`'プログラミング' after`, "プログラミング", " after", 1, codeLen!(func, "プログラミング") + 3); 6248 6249 test!(func, toer)(`"&><"`, "&><", "", 1, 16); 6250 test!(func, toer)(`"'""`, "'"", "", 1, 15); 6251 test!(func, toer)(`"hello&><world"`, "hello&><world", "", 1, 26); 6252 test!(func, toer)(`".....&><....."`, ".....&><.....", "", 1, 26); 6253 test!(func, toer)(`"ディラン"`, "ディラン", "", 1, 35); 6254 test!(func, toer)(`"hello¯M&world"`, "hello¯M&world", "", 1, 29); 6255 6256 test!(func, toer)(`'&><'`, "&><", "", 1, 16); 6257 test!(func, toer)(`'hello&><world'`, "hello&><world", "", 1, 26); 6258 test!(func, toer)(`''"'`, "'"", "", 1, 15); 6259 test!(func, toer)(`'.....&><.....'`, ".....&><.....", "", 1, 26); 6260 test!(func, toer)(`'ディラン'`, "ディラン", "", 1, 35); 6261 test!(func, toer)(`'hello¯M&world'`, "hello¯M&world", "", 
1, 29); 6262 6263 test!(func, toer)("'hello\nworld'", "hello\nworld", "", 2, 7); 6264 test!(func, toer)("'hello\nworld\n'", "hello\nworld\n", "", 3, 2); 6265 6266 test!(func, toer)(`"'''"whatever`, "'''", "whatever", 1, 6); 6267 test!(func, toer)(`'"""'whatever`, `"""`, "whatever", 1, 6); 6268 6269 test!(func, toer)(`"*"`, "*", "", 1, 8); 6270 test!(func, toer)(`"B"`, "B", "", 1, 9); 6271 test!(func, toer)(`"%foo"`, "%foo", "", 1, 7); 6272 6273 testFail!(func, toer)(`"`, 1, 1); 6274 testFail!(func, toer)(`"foo`, 1, 1); 6275 testFail!(func, toer)(`"foo'`, 1, 1); 6276 testFail!(func, toer)(`"<"`, 1, 2); 6277 testFail!(func, toer)(`"&`, 1, 2); 6278 testFail!(func, toer)(`"&"`, 1, 2); 6279 testFail!(func, toer)(`"&x"`, 1, 2); 6280 testFail!(func, toer)(`"&.;"`, 1, 2); 6281 testFail!(func, toer)(`"&&;"`, 1, 2); 6282 testFail!(func, toer)(`"&a"`, 1, 2); 6283 testFail!(func, toer)(`"&a`, 1, 2); 6284 testFail!(func, toer)(`"hello&;"`, 1, 7); 6285 testFail!(func, toer)(`"hello&;world"`,1, 7); 6286 testFail!(func, toer)(`"hello&<;world"`,1, 7); 6287 testFail!(func, toer)(`"hello&world"`,1, 7); 6288 testFail!(func, toer)(`"hello<world"`,1, 7); 6289 testFail!(func, toer)(`"hello world&"`, 1, 13); 6290 testFail!(func, toer)(`"hello world&;"`, 1, 13); 6291 testFail!(func, toer)(`"hello world&foo"`, 1, 13); 6292 testFail!(func, toer)(`"foo<"`, 1, 5); 6293 testFail!(func, toer)(`"&#`, 1, 2); 6294 testFail!(func, toer)(`"&#"`, 1, 2); 6295 testFail!(func, toer)(`"&#;"`, 1, 2); 6296 testFail!(func, toer)(`"&#x;"`, 1, 2); 6297 testFail!(func, toer)(`"&#AF;"`, 1, 2); 6298 testFail!(func, toer)(`"&#x`, 1, 2); 6299 testFail!(func, toer)(`"M`, 1, 2); 6300 testFail!(func, toer)(`"M`, 1, 1); 6301 testFail!(func, toer)(`"�`, 1, 2); 6302 testFail!(func, toer)(`"�`, 1, 2); 6303 testFail!(func, toer)(`"�"`, 1, 2); 6304 6305 testFail!(func, toer)(`'`, 1, 1); 6306 testFail!(func, toer)(`'foo`, 1, 1); 6307 testFail!(func, toer)(`'foo"`, 1, 1); 6308 testFail!(func, toer)(`'<'`, 1, 2); 6309 
testFail!(func, toer)("'\v'", 1, 2); 6310 testFail!(func, toer)("'\uFFFE'", 1, 2); 6311 testFail!(func, toer)(`'&`, 1, 2); 6312 testFail!(func, toer)(`'&'`, 1, 2); 6313 testFail!(func, toer)(`'&x'`, 1, 2); 6314 testFail!(func, toer)(`'&.;'`, 1, 2); 6315 testFail!(func, toer)(`'&&;'`, 1, 2); 6316 testFail!(func, toer)(`'&a'`, 1, 2); 6317 testFail!(func, toer)(`'&a`, 1, 2); 6318 testFail!(func, toer)(`'hello&;'`, 1, 7); 6319 testFail!(func, toer)(`'hello&;world'`, 1, 7); 6320 testFail!(func, toer)(`'hello&<;world'`, 1, 7); 6321 testFail!(func, toer)(`'hello&world'`, 1, 7); 6322 testFail!(func, toer)(`'hello<world'`, 1, 7); 6323 testFail!(func, toer)(`'hello world&'`, 1, 13); 6324 testFail!(func, toer)(`'hello world&;'`, 1, 13); 6325 testFail!(func, toer)(`'hello world&foo'`, 1, 13); 6326 testFail!(func, toer)(`'foo<'`, 1, 5); 6327 testFail!(func, toer)(`'&#`, 1, 2); 6328 testFail!(func, toer)(`'&#'`, 1, 2); 6329 testFail!(func, toer)(`'&#;'`, 1, 2); 6330 testFail!(func, toer)(`'&#x;'`, 1, 2); 6331 testFail!(func, toer)(`'&#AF;'`, 1, 2); 6332 testFail!(func, toer)(`'&#x`, 1, 2); 6333 testFail!(func, toer)(`'M`, 1, 2); 6334 testFail!(func, toer)(`'M`, 1, 1); 6335 testFail!(func, toer)(`'�`, 1, 2); 6336 testFail!(func, toer)(`'�`, 1, 2); 6337 testFail!(func, toer)(`'�'`, 1, 2); 6338 testFail!(func, toer)("'
\nF;'", 1, 2); 6339 testFail!(func, toer)("'&\n;'", 1, 2); 6340 testFail!(func, toer)("'&\namp;'", 1, 2); 6341 testFail!(func, toer)("'\n&&;'", 2, 6); 6342 } 6343 { 6344 alias toer = ThrowOnEntityRef.yes; 6345 testFail!(func, toer)(`"&foo;"`, 1, 2); 6346 testFail!(func, toer)(`"hello world&foo;"`, 1, 13); 6347 testFail!(func, toer)(`"hello &foo; world"`, 1, 8); 6348 testFail!(func, toer)(`"&am;"`, 1, 2); 6349 testFail!(func, toer)(`"&e;"`, 1, 2); 6350 testFail!(func, toer)(`"&l;"`, 1, 2); 6351 testFail!(func, toer)(`"<e;"`, 1, 2); 6352 testFail!(func, toer)(`"&g;"`, 1, 2); 6353 testFail!(func, toer)(`">e;"`, 1, 2); 6354 testFail!(func, toer)(`"&apo;"`, 1, 2); 6355 testFail!(func, toer)(`"&aposs;"`, 1, 2); 6356 testFail!(func, toer)(`"&quo;"`, 1, 2); 6357 testFail!(func, toer)(`""e;"`, 1, 2); 6358 6359 testFail!(func, toer)(`'&foo;'`, 1, 2); 6360 testFail!(func, toer)(`'hello world&foo;'`, 1, 13); 6361 testFail!(func, toer)(`'hello &foo; world'`, 1, 8); 6362 testFail!(func, toer)(`'&am;'`, 1, 2); 6363 testFail!(func, toer)(`'&e;'`, 1, 2); 6364 testFail!(func, toer)(`'&l;'`, 1, 2); 6365 testFail!(func, toer)(`'<e;'`, 1, 2); 6366 testFail!(func, toer)(`'&g;'`, 1, 2); 6367 testFail!(func, toer)(`'>e;'`, 1, 2); 6368 testFail!(func, toer)(`'&apo;'`, 1, 2); 6369 testFail!(func, toer)(`'&aposs;'`, 1, 2); 6370 testFail!(func, toer)(`'&quo;'`, 1, 2); 6371 testFail!(func, toer)(`'"e;'`, 1, 2); 6372 } 6373 { 6374 alias toer = ThrowOnEntityRef.no; 6375 test!(func, toer)(`"&foo;"`, "&foo;", "", 1, 8); 6376 test!(func, toer)(`"hello world&foo;"`, "hello world&foo;", "", 1, 19); 6377 test!(func, toer)(`"hello &foo; world"`, "hello &foo; world", "", 1, 20); 6378 test!(func, toer)(`"&am;"`, "&am;", "", 1, 7); 6379 test!(func, toer)(`"&e;"`, "&e;", "", 1, 9); 6380 test!(func, toer)(`"&l;"`, "&l;", "", 1, 6); 6381 test!(func, toer)(`"<e;"`, "<e;", "", 1, 8); 6382 test!(func, toer)(`"&g;"`, "&g;", "", 1, 6); 6383 test!(func, toer)(`">e;"`, ">e;", "", 1, 8); 6384 test!(func, 
toer)(`"&apo;"`, "&apo;", "", 1, 8); 6385 test!(func, toer)(`"&aposs;"`, "&aposs;", "", 1, 10); 6386 test!(func, toer)(`"&quo;"`, "&quo;", "", 1, 8); 6387 test!(func, toer)(`""e;"`, ""e;", "", 1, 10); 6388 6389 test!(func, toer)(`'&foo;'`, "&foo;", "", 1, 8); 6390 test!(func, toer)(`'hello world&foo;'`, "hello world&foo;", "", 1, 19); 6391 test!(func, toer)(`'hello &foo; world'`, "hello &foo; world", "", 1, 20); 6392 test!(func, toer)(`'&am;'`, "&am;", "", 1, 7); 6393 test!(func, toer)(`'&e;'`, "&e;", "", 1, 9); 6394 test!(func, toer)(`'&l;'`, "&l;", "", 1, 6); 6395 test!(func, toer)(`'<e;'`, "<e;", "", 1, 8); 6396 test!(func, toer)(`'&g;'`, "&g;", "", 1, 6); 6397 test!(func, toer)(`'>e;'`, ">e;", "", 1, 8); 6398 test!(func, toer)(`'&apo;'`, "&apo;", "", 1, 8); 6399 test!(func, toer)(`'&aposs;'`, "&aposs;", "", 1, 10); 6400 test!(func, toer)(`'&quo;'`, "&quo;", "", 1, 8); 6401 test!(func, toer)(`'"e;'`, ""e;", "", 1, 10); 6402 } 6403 } 6404 6405 // These can't be tested with testFail, because attempting to convert 6406 // invalid Unicode results in UnicodeExceptions before parseXML even 6407 // gets called. 6408 import std.meta : AliasSeq; 6409 static foreach(str; AliasSeq!("'" ~ cast(string)[255] ~ "'", 6410 "'"w ~ cast(wstring)[0xD800] ~ "'", 6411 "'"d ~ cast(dstring)[0xD800] ~ "'")) 6412 {{ 6413 auto text = testParser(str); 6414 auto e = collectException!XMLParsingException(text.takeAttValue()); 6415 assert(e ! 
// Validates an EntityType.text field to verify that it does not contain invalid
// characters.
//
// The check runs on a saved copy of orig, so orig itself is not advanced.
//
// Params:
//     allowRestrictedChars = If true, '&', '<', and "]]>" are not treated
//         specially and only the legality of each character as an XML
//         character is checked. If false, '<' and "]]>" are rejected, and
//         every '&' must begin a syntactically valid character reference or
//         entity reference (only the five predefined entity references are
//         accepted when Text.config.throwOnEntityRef is ThrowOnEntityRef.yes).
//     orig = The text to validate.
//
// Throws: XMLParsingException at the position of the first offending
//     character (for a bad reference, the position of its '&'; for "]]>", the
//     position of the first ']').
void checkText(bool allowRestrictedChars, Text)(ref Text orig)
{
    import std.format : format;
    import std.utf : decodeFront, UseReplacementDchar;

    auto text = orig.save;
    loop: while(!text.input.empty)
    {
        switch(text.input.front)
        {
            static if(!allowRestrictedChars)
            {
                case '&':
                {
                    import dxml.util : parseCharRef;

                    // Character references
                    {
                        auto temp = text.input.save;
                        auto charRef = parseCharRef(temp);
                        if(!charRef.isNull)
                        {
                            static if(hasLength!(Text.Input))
                            {
                                text.pos.col += text.input.length - temp.length;
                                text.input = temp;
                            }
                            else
                            {
                                // Without length, walk to the terminating ';'
                                // to keep the column in sync.
                                while(text.input.front != ';')
                                    popFrontAndIncCol(text);
                                popFrontAndIncCol(text);
                            }
                            continue;
                        }
                    }

                    // Errors about a bad reference are reported at its '&'.
                    immutable ampPos = text.pos;
                    popFrontAndIncCol(text);

                    // Std Entity References
                    static if(Text.config.throwOnEntityRef == ThrowOnEntityRef.yes)
                    {
                        static foreach(entRef; ["amp;", "apos;", "quot;", "lt;", "gt;"])
                        {
                            if(text.stripStartsWith(entRef))
                                continue loop;
                        }

                        throw new XMLParsingException("& is only legal in an EntityType.text entity as part of a " ~
                                                      "reference, and this parser only supports entity references if " ~
                                                      "they're predefined by the spec. This is not a valid character " ~
                                                      "reference or one of the predefined entity references.", ampPos);
                    }
                    // All Entity References
                    else
                    {
                        import dxml.internal : isNameStartChar, isNameChar;

                        if(text.input.empty)
                            goto failedEntityRef;
                        {
                            size_t numCodeUnits;
                            immutable decodedC = text.input.decodeFront!(UseReplacementDchar.yes)(numCodeUnits);
                            if(!isNameStartChar(decodedC))
                                goto failedEntityRef;
                            text.pos.col += numCodeUnits;
                        }
                        while(true)
                        {
                            if(text.input.empty)
                                goto failedEntityRef;
                            immutable c = text.input.front;
                            if(c == ';')
                                break;
                            size_t numCodeUnits;
                            immutable decodedC = text.input.decodeFront!(UseReplacementDchar.yes)(numCodeUnits);
                            if(!isNameChar(decodedC))
                                goto failedEntityRef;
                            text.pos.col += numCodeUnits;
                        }
                        assert(text.input.front == ';');
                        popFrontAndIncCol(text);
                        continue;

                        failedEntityRef:
                        throw new XMLParsingException("& is only legal in an EntityType.text entity as part of a " ~
                                                      "character or entity reference, and this is not a valid " ~
                                                      "character or entity reference.", ampPos);
                    }
                }
                case '<': throw new XMLParsingException("< is not legal in EntityType.text", text.pos);
                case ']':
                {
                    popFrontAndIncCol(text);
                    if(text.stripStartsWith("]>"))
                    {
                        // Move the column back to the first ']' of "]]>".
                        text.pos.col -= 3;
                        throw new XMLParsingException("]]> is not legal in EntityType.text", text.pos);
                    }
                    break;
                }
            }
            case '\n':
            {
                nextLine!(text.config)(text.pos);
                text.input.popFront();
                break;
            }
            default:
            {
                import std.ascii : isASCII;
                import dxml.internal : isXMLChar;
                immutable c = text.input.front;
                if(isASCII(c))
                {
                    if(!isXMLChar(c))
                    {
                        throw new XMLParsingException(format!"Character is not legal in an XML File: 0x%0x"(c),
                                                      text.pos);
                    }
                    popFrontAndIncCol(text);
                }
                else
                {
                    import std.utf : UTFException;
                    // Annoyingly, letting decodeFront throw is the easier way to handle this, since the
                    // replacement character is considered valid XML, and if we decoded using it, then
                    // all of the invalid Unicode characters would come out as the replacement character
                    // and then be treated as valid instead of being caught, which isn't all bad, but
                    // the spec requires that they be treated as invalid instead of playing nice and
                    // using the replacement character.
                    try
                    {
                        size_t numCodeUnits;
                        immutable decodedC = text.input.decodeFront!(UseReplacementDchar.no)(numCodeUnits);
                        if(!isXMLChar(decodedC))
                        {
                            enum fmt = "Character is not legal in an XML File: 0x%0x";
                            throw new XMLParsingException(format!fmt(decodedC), text.pos);
                        }
                        text.pos.col += numCodeUnits;
                    }
                    catch(UTFException)
                        throw new XMLParsingException("Invalid Unicode character", text.pos);
                }
                break;
            }
        }
    }
}
__FILE__, line); 6612 } 6613 { 6614 auto pos = TextPos(row + 3, row == 1 ? col + 7 : col); 6615 auto range = testParser!(makeConfig(toer))(xml); 6616 range.pos.line += 3; 6617 range.pos.col += 7; 6618 auto e = collectException!XMLParsingException(checkText!arc(range)); 6619 enforce!AssertError(e !is null, "unittest failure 3", __FILE__, line); 6620 enforce!AssertError(e.pos == pos, "unittest failure 4", __FILE__, line); 6621 } 6622 } 6623 6624 static foreach(func; testRangeFuncs) 6625 { 6626 static foreach(toer; [ThrowOnEntityRef.yes, ThrowOnEntityRef.no]) 6627 { 6628 static foreach(arc; [false, true]) 6629 { 6630 test!(func, arc, toer)(""); 6631 test!(func, arc, toer)("J",); 6632 test!(func, arc, toer)("foo"); 6633 test!(func, arc, toer)("プログラミング"); 6634 6635 test!(func, arc, toer)("&><"); 6636 test!(func, arc, toer)("hello&><world"); 6637 test!(func, arc, toer)(".....'"&....."); 6638 test!(func, arc, toer)("ディラン"); 6639 test!(func, arc, toer)("hello¯*"world"); 6640 6641 test!(func, arc, toer)("]]"); 6642 test!(func, arc, toer)("]>"); 6643 test!(func, arc, toer)("foo]]bar"); 6644 test!(func, arc, toer)("foo]>bar"); 6645 test!(func, arc, toer)("]] >"); 6646 6647 testFail!(func, arc, toer)("\v", 1, 1); 6648 testFail!(func, arc, toer)("\uFFFE", 1, 1); 6649 testFail!(func, arc, toer)("hello\vworld", 1, 6); 6650 testFail!(func, arc, toer)("he\nllo\vwo\nrld", 2, 4); 6651 } 6652 6653 testFail!(func, false, toer)("<", 1, 1); 6654 testFail!(func, false, toer)("&", 1, 1); 6655 testFail!(func, false, toer)("&", 1, 1); 6656 testFail!(func, false, toer)("&x", 1, 1); 6657 testFail!(func, false, toer)("&&;", 1, 1); 6658 testFail!(func, false, toer)("&a", 1, 1); 6659 testFail!(func, false, toer)("hello&;", 1, 6); 6660 testFail!(func, false, toer)("hello&;world", 1, 6); 6661 testFail!(func, false, toer)("hello&<;world", 1, 6); 6662 testFail!(func, false, toer)("hello&world", 1, 6); 6663 testFail!(func, false, toer)("hello world&", 1, 12); 6664 testFail!(func, false, toer)("hello 
world&;", 1, 12); 6665 testFail!(func, false, toer)("hello world&foo", 1, 12); 6666 testFail!(func, false, toer)("&#;", 1, 1); 6667 testFail!(func, false, toer)("&#x;", 1, 1); 6668 testFail!(func, false, toer)("&#AF;", 1, 1); 6669 testFail!(func, false, toer)("&#x", 1, 1); 6670 testFail!(func, false, toer)("*", 1, 1); 6671 testFail!(func, false, toer)("B", 1, 1); 6672 testFail!(func, false, toer)("", 1, 1); 6673 testFail!(func, false, toer)("", 1, 1); 6674 testFail!(func, false, toer)("*foo\nbar&#;", 2, 4); 6675 testFail!(func, false, toer)("*foo\nbar&#x;", 2, 4); 6676 testFail!(func, false, toer)("*foo\nbar&#AF;", 2, 4); 6677 testFail!(func, false, toer)("*foo\nbar&#x", 2, 4); 6678 testFail!(func, false, toer)("*foo\nbar*", 2, 4); 6679 testFail!(func, false, toer)("*foo\nbarB", 2, 4); 6680 testFail!(func, false, toer)("プログラミング&", 1, codeLen!(func, "プログラミング&")); 6681 6682 static if(toer == ThrowOnEntityRef.yes) 6683 { 6684 testFail!(func, false, toer)("&a;", 1, 1); 6685 testFail!(func, false, toer)(`&am;`, 1, 1); 6686 testFail!(func, false, toer)(`&e;`, 1, 1); 6687 testFail!(func, false, toer)(`&l;`, 1, 1); 6688 testFail!(func, false, toer)(`<e;`, 1, 1); 6689 testFail!(func, false, toer)(`&g;`, 1, 1); 6690 testFail!(func, false, toer)(`>e;`, 1, 1); 6691 testFail!(func, false, toer)(`&apo;`, 1, 1); 6692 testFail!(func, false, toer)(`&aposs;`, 1, 1); 6693 testFail!(func, false, toer)(`&quo;`, 1, 1); 6694 testFail!(func, false, toer)(`"e;`, 1, 1); 6695 testFail!(func, false, toer)(`hello &foo; world`, 1, 7); 6696 testFail!(func, false, toer)("hello\n &foo; \nworld", 2, 2); 6697 } 6698 else 6699 { 6700 test!(func, false, toer)("&a;"); 6701 test!(func, false, toer)(`&am;`); 6702 test!(func, false, toer)(`&e;`); 6703 test!(func, false, toer)(`&l;`); 6704 test!(func, false, toer)(`<e;`); 6705 test!(func, false, toer)(`&g;`); 6706 test!(func, false, toer)(`>e;`); 6707 test!(func, false, toer)(`&apo;`); 6708 test!(func, false, toer)(`&aposs;`); 6709 test!(func, false, 
toer)(`&quo;`); 6710 test!(func, false, toer)(`"e;`); 6711 test!(func, false, toer)(`hello &foo; world`); 6712 test!(func, false, toer)("hello\n &foo; \nworld"); 6713 } 6714 6715 testFail!(func, false, toer)("]]>", 1, 1); 6716 testFail!(func, false, toer)("foo]]>bar", 1, 4); 6717 6718 test!(func, true, toer)("]]>"); 6719 test!(func, true, toer)("foo]]>bar"); 6720 6721 test!(func, true, toer)("<"); 6722 test!(func, true, toer)("&"); 6723 test!(func, true, toer)("&x"); 6724 test!(func, true, toer)("&&;"); 6725 test!(func, true, toer)("&a"); 6726 test!(func, true, toer)("&a;"); 6727 test!(func, true, toer)(`&am;`); 6728 test!(func, true, toer)(`&e;`); 6729 test!(func, true, toer)(`&l;`); 6730 test!(func, true, toer)(`<e;`); 6731 test!(func, true, toer)(`&g;`); 6732 test!(func, true, toer)(`>e;`); 6733 test!(func, true, toer)(`&apo;`); 6734 test!(func, true, toer)(`&aposs;`); 6735 test!(func, true, toer)(`&quo;`); 6736 test!(func, true, toer)(`"e;`); 6737 test!(func, true, toer)("hello&;"); 6738 test!(func, true, toer)("hello&;world"); 6739 test!(func, true, toer)("hello&<;world"); 6740 test!(func, true, toer)("hello&world"); 6741 test!(func, true, toer)("hello world&"); 6742 test!(func, true, toer)("hello world&;"); 6743 test!(func, true, toer)("hello world&foo"); 6744 test!(func, true, toer)("&#;"); 6745 test!(func, true, toer)("&#x;"); 6746 test!(func, true, toer)("&#AF;"); 6747 test!(func, true, toer)("&#x"); 6748 test!(func, true, toer)("*"); 6749 test!(func, true, toer)("B"); 6750 test!(func, true, toer)(""); 6751 test!(func, true, toer)(""); 6752 test!(func, true, toer)("*foo\nbar&#;"); 6753 test!(func, true, toer)("*foo\nbar&#x;"); 6754 test!(func, true, toer)("*foo\nbar&#AF;"); 6755 test!(func, true, toer)("*foo\nbar&#x"); 6756 test!(func, true, toer)("*foo\nbar*"); 6757 test!(func, true, toer)("*foo\nbarB"); 6758 test!(func, true, toer)("プログラミング&"); 6759 } 6760 } 6761 6762 // These can't be tested with testFail, because attempting to convert 6763 // invalid 
Unicode results in UnicodeExceptions before parseXML even 6764 // gets called. 6765 import std.meta : AliasSeq; 6766 static foreach(str; AliasSeq!(cast(string)[255], cast(wstring)[0xD800], cast(dstring)[0xD800])) 6767 { 6768 static foreach(arc; [false, true]) 6769 {{ 6770 auto text = testParser(str); 6771 auto e = collectException!XMLParsingException(text.checkText!arc()); 6772 assert(e ! is null); 6773 assert(e.pos == TextPos(1, 1)); 6774 }} 6775 } 6776 } 6777 6778 @safe unittest 6779 { 6780 import dxml.internal : testRangeFuncs; 6781 6782 static foreach(func; testRangeFuncs) 6783 { 6784 static foreach(arc; [false, true]) 6785 { 6786 static foreach(config; [Config.init, simpleXML, makeConfig(ThrowOnEntityRef.no)]) 6787 {{ 6788 auto xml = func("foo"); 6789 auto text = testParser!config(xml); 6790 checkText!arc(text); 6791 }} 6792 } 6793 } 6794 } 6795 6796 6797 // S := (#x20 | #x9 | #xD | #XA)+ 6798 bool isSpace(C)(C c) @safe pure nothrow @nogc 6799 if(isSomeChar!C) 6800 { 6801 switch(c) 6802 { 6803 case ' ': 6804 case '\t': 6805 case '\r': 6806 case '\n': return true; 6807 default : return false; 6808 } 6809 } 6810 6811 pure nothrow @safe @nogc unittest 6812 { 6813 foreach(char c; char.min .. char.max) 6814 { 6815 if(c == ' ' || c == '\t' || c == '\r' || c == '\n') 6816 assert(isSpace(c)); 6817 else 6818 assert(!isSpace(c)); 6819 } 6820 foreach(wchar c; wchar.min .. wchar.max / 100) 6821 { 6822 if(c == ' ' || c == '\t' || c == '\r' || c == '\n') 6823 assert(isSpace(c)); 6824 else 6825 assert(!isSpace(c)); 6826 } 6827 foreach(dchar c; dchar.min .. 
dchar.max / 1000) // upper bound of the dchar `foreach` header that opens on the preceding line
    {
        if(c == ' ' || c == '\t' || c == '\r' || c == '\n')
            assert(isSpace(c));
        else
            assert(!isSpace(c));
    }
}


// Pops one element off text.input and advances text.pos.col by one to match.
pragma(inline, true) void popFrontAndIncCol(Text)(ref Text text)
{
    text.input.popFront();
    ++text.pos.col;
}

// Moves pos to the first column of the following line.
// The Config template parameter is not used by the body.
pragma(inline, true) void nextLine(Config config)(ref TextPos pos)
{
    ++pos.line;
    pos.col = 1;
}

// Throws an XMLParsingException ("Prematurely reached end of document") at
// text.pos if text.input is empty; otherwise does nothing.
// `line` defaults to the caller's line and is handed to the exception
// constructor together with this file's __FILE__.
// TODO create bug report, because this function cannot be inlined
/+pragma(inline, true)+/ void checkNotEmpty(Text)(ref Text text, size_t line = __LINE__)
{
    if(text.input.empty)
        throw new XMLParsingException("Prematurely reached end of document", text.pos, __FILE__, line);
}


// A small set of parser configurations, only defined in unittest builds.
version(unittest)
    enum someTestConfigs = [Config.init, simpleXML, makeConfig(SkipComments.yes), makeConfig(SkipPI.yes)];


// Fuzz-testing failures
unittest
{
    // Parses `xml` with the default configuration and reads every property
    // that applies to each entity's type (name, text, attribute names and
    // values). The values read are discarded.
    static void parseEverything(string xml)
    {
        with(EntityType) foreach(entity; parseXML(xml))
        {
            // Entity types that have a name.
            final switch(entity.type)
            {
                case cdata: break;
                case comment: break;
                case elementStart: auto name = entity.name; break;
                case elementEnd: goto case elementStart;
                case elementEmpty: goto case elementStart;
                case pi: goto case elementStart;
                case text: break;
            }

            // Entity types that have text or attributes.
            final switch(entity.type)
            {
                case cdata: auto text = entity.text; break;
                case comment: goto case cdata;
                case elementStart:
                {
                    foreach(attr; entity.attributes)
                    {
                        auto name = attr.name;
                        auto value = attr.value;
                    }
                    break;
                }
                case elementEnd: break;
                case elementEmpty: goto case elementStart;
                case pi: goto case cdata;
                case text: goto case cdata;
            }
        }
    }

    // Asserts that fully parsing `xml` throws an XMLParsingException.
    // `line` defaults to the call site and is forwarded to assertThrown so
    // that a failure is reported at the line of the failing input rather than
    // inside this helper.
    static void testFail(string xml, size_t line = __LINE__)
    {
        import std.exception : assertThrown;
        assertThrown!XMLParsingException(parseEverything(xml), "unittest failure", __FILE__, line);
    }

    // Raw bytes of an input recorded from fuzz testing; it contains bytes that
    // are not valid UTF-8 (0xff, 0xd8).
    testFail([0x3c, 0xff, 0x3e, 0x3e, 0x3a, 0x3c, 0x2f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
              0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
              0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
              0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
              0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x31, 0xff,
              0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xff, 0xff,
              0xff]);
}