1 // Written in the D programming language 2 3 /++ 4 This module contains helper functions which aren't specific to the parser, 5 the DOM, or the writer. 6 7 $(TABLE 8 $(TR $(TH Symbol) $(TH Description)) 9 $(TR $(TD $(LREF decodeXML)) 10 $(TD Takes a range of characters, strips carriage returns from it, 11 and converts both character references and the predefined 12 entity references in the range into the characters that they 13 refer to.)) 14 $(TR $(TD $(LREF asDecodedXML)) 15 $(TD The version of $(LREF decodeXML) that returns a lazy range.)) 16 $(TR $(TD $(LREF parseCharRef)) 17 $(TD Parses a character reference from the front of a range of 18 characters.)) 19 $(TR $(TD $(LREF parseStdEntityRef)) 20 $(TD Parses one of the predefined entity references from the start 21 of a range of characters.)) 22 $(TR $(TD $(LREF stripIndent)) 23 $(TD Removes the indent from the front of each line of a range of 24 characters that was XML text which was formatted for 25 human-readability.)) 26 $(TR $(TD $(LREF withoutIndent)) 27 $(TD The version of $(LREF stripIndent) that returns a lazy 28 range.)) 29 $(TR $(TD $(LREF StdEntityRef)) 30 $(TD Enum containing the string representations of the five, 31 predefined entity references.)) 32 $(TR $(TD $(LREF encodeText)) 33 $(TD Encodes characters which cannot appear in 34 $(REF_ALTTEXT EntityType.text, EntityType.text, dxml, parser) 35 in their literal form.)) 36 $(TR $(TD $(LREF encodeAttr)) 37 $(TD Encodes characters which cannot appear in the attribute value 38 of an element start tag in their literal form.)) 39 $(TR $(TD $(LREF encodeCharRef)) 40 $(TD Encodes a character as a character reference.)) 41 ) 42 43 Copyright: Copyright 2018 - 2023 44 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 
45 Authors: $(HTTPS jmdavisprog.com, Jonathan M Davis) 46 Source: $(LINK_TO_SRC dxml/_util.d) 47 48 See_Also: $(LINK2 http://www.w3.org/TR/REC-xml/, Official Specification for XML 1.0) 49 +/ 50 module dxml.util; 51 52 import std.range.primitives; 53 import std.traits; 54 import std.typecons : Nullable; 55 56 /++ 57 Decodes any XML character references and standard XML entity references in 58 the text as well as removing any carriage returns. It's intended to be used 59 on the text fields of element tags and on the values of start tag 60 attributes. 61 62 There are a number of characters that either can't be directly represented 63 in the text fields or attribute values in XML or which can sometimes be 64 directly represented but not always (e.g. an attribute value can contain 65 either a single quote or a double quote, but it can't contain both at the 66 same time, because one of them would match the opening quote). So, those 67 characters have alternate representations in order to be allowed (e.g. 68 $(D_CODE_STRING "$(AMP)lt;") for $(D_CODE_STRING '<'), because 69 $(D_CODE_STRING '<') would normally be the beginning of an entity). 70 Technically, they're entity references, but the ones handled by decodeXML 71 are the ones explicitly defined in the XML standard and which don't require 72 a DTD section. 73 74 Ideally, the parser would transform all such alternate representations to 75 what they represent when providing the text to the application, but that 76 would make it impossible to return slices of the original text from the 77 properties of an $(REF_ALTTEXT Entity, EntityRange.Entity, dxml, parser). 78 So, instead of having those properties do the transformation themselves, 79 decodeXML and asDecodedXML do that so that the application can choose to do 80 it or not (in many cases, there is nothing to decode, making the calls 81 unnecessary). 82 83 Similarly, an application can choose to encode a character as a character 84 reference (e.g. 
$(D_CODE_STRING "$(AMP)#65;") or 85 $(D_CODE_STRING "$(AMP)#x41;") for $(D_CODE_STRING 'A')). decodeXML will 86 decode such character references to their corresponding characters. 87 88 However, decodeXML does not handle any entity references beyond the five 89 predefined ones listed below. All others are left unprocessed. Processing 90 them properly would require handling the DTD section, which dxml does not 91 support. The parser considers any entity references other than the 92 predefined ones to be invalid XML, so unless the text being passed to 93 decodeXML doesn't come from dxml's parser, it can't have any entity 94 references in it other than the predefined ones. Similarly, invalid 95 character references are left unprocessed as well as any character that is 96 not valid in an XML document. decodeXML never throws on invalid XML. 97 98 Also, $(D_CODE_STRING '\r') is not supposed to appear in an XML document 99 except as a character reference unless it's in a CDATA section. So, it 100 really should be stripped out before being handed off to the application, 101 but again, that doesn't work with slices. So, decodeXML also handles that. 102 103 Specifically, what decodeXML and asDecodedXML do is 104 105 $(TABLE 106 $(TR $(TD convert $(D_CODE_STRING $(AMP)amp;) to $(D_CODE_STRING &))) 107 $(TR $(TD convert $(D_CODE_STRING $(AMP)gt;) to $(D_CODE_STRING >))) 108 $(TR $(TD convert $(D_CODE_STRING $(AMP)lt;) to $(D_CODE_STRING <))) 109 $(TR $(TD convert $(D_CODE_STRING $(AMP)apos;) to $(D_CODE_STRING '))) 110 $(TR $(TD convert $(D_CODE_STRING $(AMP)quot;) to $(D_CODE_STRING "))) 111 $(TR $(TD remove all instances of $(D_CODE_STRING \r))) 112 $(TR $(TD convert all character references (e.g. 113 $(D_CODE_STRING $(AMP)#xA;)) to the characters that they 114 represent)) 115 ) 116 117 All other entity references are left untouched, and any $(D_CODE_STRING '&') 118 which is not used in one of the constructs listed in the table as well as 119 any malformed constructs (e.g. 
$(D_CODE_STRING "&Amp;") or 120 $(D_CODE_STRING "&#xGGA2;")) are left untouched. 121 122 The difference between decodeXML and asDecodedXML is that decodeXML returns 123 a $(K_STRING), whereas asDecodedXML returns a lazy _range of code 124 units. In the case where a $(K_STRING) is passed to decodeXML, it 125 will simply return the original $(K_STRING) if there is no text to decode 126 (whereas in other cases, decodeXML and asDecodedXML are forced to return 127 new ranges even if there is no text to decode). 128 129 Params: 130 range = The _range of characters to decodeXML. 131 132 Returns: The decoded text. decodeXML returns a $(K_STRING), whereas 133 asDecodedXML returns a lazy _range of code units (so it could be a 134 _range of $(K_CHAR) or $(K_WCHAR) and not just $(K_DCHAR); which it 135 is depends on the code units of the _range being passed in). 136 137 See_Also: $(LINK http://www.w3.org/TR/REC-xml/#dt-chardata)$(BR) 138 $(LREF parseStdEntityRef)$(BR) 139 $(LREF parseCharRef)$(BR) 140 $(REF EntityRange.Entity.attributes, dxml, parser)$(BR) 141 $(REF EntityRange.Entity.text, dxml, parser)$(BR) 142 $(LREF encodeAttr)$(BR) 143 $(LREF encodeText) 144 +/ 145 string decodeXML(R)(R range) 146 if(isForwardRange!R && isSomeChar!(ElementType!R)) 147 { 148 import std.conv : to; 149 150 static if(isDynamicArray!R && is(Unqual!(ElementEncodingType!R) == char)) 151 { 152 import std.algorithm.searching : find, startsWith; 153 import std.array : appender; 154 import std.meta : AliasSeq; 155 156 auto found = range.find('&', '\r'); 157 if(found[1] == 0) 158 return range.to!string(); 159 160 auto retval = appender!string(); 161 retval.reserve(range.length); 162 put(retval, range[0 .. $ - found[0].length]); 163 range = range[$ - found[0].length .. $]; 164 165 size_t i = 0; 166 loop: for(; i != range.length;) 167 { 168 switch(range[i]) 169 { 170 case '&': 171 { 172 if(i + 1 == range.length) 173 { 174 ++i; 175 break loop; 176 } 177 put(retval, range[0 .. i]); 178 range = range[i .. 
$]; 179 i = 0; 180 static foreach(func; AliasSeq!(parseStdEntityRef, parseCharRef)) 181 {{ 182 immutable c = func(range); 183 if(!c.isNull) 184 { 185 put(retval, c.get); 186 continue loop; 187 } 188 }} 189 put(retval, '&'); 190 range = range[1 .. $]; 191 continue; 192 } 193 case '\r': 194 { 195 if(i != 0) 196 { 197 put(retval, range[0 .. i]); 198 range = range[i + 1 .. $]; 199 i = 0; 200 } 201 else 202 range = range[1 .. $]; 203 continue; 204 } 205 default: ++i; continue; 206 } 207 } 208 209 if(i != 0) 210 put(retval, range[0 .. i]); 211 212 return retval.data; 213 } 214 else 215 return range.asDecodedXML().to!string(); 216 } 217 218 219 /// Ditto 220 auto asDecodedXML(R)(R range) 221 if(isForwardRange!R && isSomeChar!(ElementType!R)) 222 { 223 import std.meta : AliasSeq; 224 import std.utf : byCodeUnit, encode, UseReplacementDchar; 225 226 static struct DecodedXML 227 { 228 public: 229 230 @property empty() { return _range.empty && _begin == _end; } 231 232 void popFront() 233 { 234 if(_begin != _end) 235 { 236 if(++_begin != _end) 237 return; 238 } 239 else 240 _range.popFront(); 241 _popFrontImpl(); 242 } 243 244 @property save() 245 { 246 auto retval = this; 247 retval._range = _range.save; 248 return retval; 249 } 250 251 private: 252 253 void _popFrontImpl() 254 { 255 while(!_range.empty) 256 { 257 switch(_range.front) 258 { 259 case '&': 260 { 261 static foreach(func; AliasSeq!(parseStdEntityRef, parseCharRef)) 262 {{ 263 immutable c = func(_range); 264 if(!c.isNull) 265 { 266 _begin = 0; 267 _end = _buffer.encode!(UseReplacementDchar.yes)(c.get); 268 return; 269 } 270 }} 271 goto default; 272 } 273 case '\r': 274 { 275 assert(_begin == _end); 276 _range.popFront(); 277 continue; 278 } 279 default: 280 { 281 assert(_begin == _end); 282 return; 283 } 284 } 285 } 286 } 287 288 this(R range) @safe 289 { 290 _range = byCodeUnit(range); 291 _popFrontImpl(); 292 } 293 294 typeof(byCodeUnit(R.init)) _range; 295 static if(is(Unqual!(ElementEncodingType!R) == char)) 
296 char[4] _buffer; 297 else static if(is(Unqual!(ElementEncodingType!R) == wchar)) 298 wchar[2] _buffer; 299 else 300 dchar[1] _buffer; 301 size_t _begin; 302 size_t _end; 303 304 public: 305 306 // FIXME A compiler bug prevents this from going with the public declarations 307 // above. If it's there, the compiler thinks that _buffer isn't defined when 308 // it tries to compile front. It needs to be reduced and reported. 309 @property typeof(_buffer[0]) front() { return _begin == _end ? _range.front : _buffer[_begin]; } 310 } 311 312 return DecodedXML(range); 313 } 314 315 /// 316 version(dxmlTests) unittest 317 { 318 assert(decodeXML("hello world &><'" \r\r\r\r\r foo") == 319 `hello world &><'" foo`); 320 321 assert(decodeXML("if(foo && bar)\r\n" ~ 322 " left = right;") == 323 "if(foo && bar)\n" ~ 324 " left = right;"); 325 326 assert(decodeXML("ディラン") == "ディラン"); 327 assert(decodeXML("foo") == "foo"); 328 assert(decodeXML("&# ;") == "&# ;"); 329 330 { 331 import std.algorithm.comparison : equal; 332 auto range = asDecodedXML("hello world &><'" " ~ 333 "\r\r\r\r\r foo"); 334 assert(equal(range, `hello world &><'" foo`)); 335 } 336 337 { 338 import dxml.parser; 339 auto xml = "<root>\n" ~ 340 " <function return='vector<int>' name='foo'>\r\n" ~ 341 " <doc_comment>This function does something really\r\n" ~ 342 " fancy, and you will love it.</doc_comment>\r\n" ~ 343 " <param type='int' name='i'>\r\n" ~ 344 " <param type='const std::string&' name='s'>\r\n" ~ 345 " </function>\n" ~ 346 "</root>"; 347 auto range = parseXML!simpleXML(xml); 348 range.popFront(); 349 assert(range.front.type == EntityType.elementStart); 350 assert(range.front.name == "function"); 351 { 352 auto attrs = range.front.attributes; 353 assert(attrs.front.name == "return"); 354 assert(attrs.front.value == "vector<int>"); 355 assert(decodeXML(attrs.front.value) == "vector<int>"); 356 attrs.popFront(); 357 assert(attrs.front.name == "name"); 358 assert(attrs.front.value == "foo"); 359 
assert(decodeXML(attrs.front.value) == "foo"); 360 } 361 range.popFront(); 362 363 assert(range.front.type == EntityType.elementStart); 364 assert(range.front.name == "doc_comment"); 365 range.popFront(); 366 367 assert(range.front.text == 368 "This function does something really\r\n" ~ 369 " fancy, and you will love it."); 370 assert(decodeXML(range.front.text) == 371 "This function does something really\n" ~ 372 " fancy, and you will love it."); 373 range.popFront(); 374 375 assert(range.front.type == EntityType.elementEnd); 376 assert(range.front.name == "doc_comment"); 377 range.popFront(); 378 379 assert(range.front.type == EntityType.elementStart); 380 assert(range.front.name == "param"); 381 { 382 auto attrs = range.front.attributes; 383 assert(attrs.front.name == "type"); 384 assert(attrs.front.value == "int"); 385 assert(decodeXML(attrs.front.value) == "int"); 386 attrs.popFront(); 387 assert(attrs.front.name == "name"); 388 assert(attrs.front.value == "i"); 389 assert(decodeXML(attrs.front.value) == "i"); 390 } 391 range.popFront(); 392 393 assert(range.front.type == EntityType.elementStart); 394 assert(range.front.name == "param"); 395 { 396 auto attrs = range.front.attributes; 397 assert(attrs.front.name == "type"); 398 assert(attrs.front.value == "const std::string&"); 399 assert(decodeXML(attrs.front.value) == "const std::string&"); 400 attrs.popFront(); 401 assert(attrs.front.name == "name"); 402 assert(attrs.front.value == "s"); 403 assert(decodeXML(attrs.front.value) == "s"); 404 } 405 } 406 } 407 408 version(dxmlTests) unittest 409 { 410 import core.exception : AssertError; 411 import std.algorithm.comparison : equal; 412 import std.exception : enforce; 413 import std.utf : byUTF; 414 import dxml.internal : testRangeFuncs; 415 416 static void test(alias func)(string text, string expected, size_t line = __LINE__) 417 { 418 auto range = func(text); 419 enforce!AssertError(range.save.decodeXML() == expected, "unittest failed 1", __FILE__, line); 420 
alias C = ElementType!(typeof(range.save.asDecodedXML())); 421 enforce!AssertError(equal(range.save.asDecodedXML(), expected.byUTF!C), "unittest failed 2", __FILE__, line); 422 } 423 424 static foreach(func; testRangeFuncs) 425 {{ 426 test!func("hello world & > < ' " \r\r\r\r\r foo", `hello world & > < ' " foo`); 427 test!func("&", "&"); 428 test!func("�", "�"); 429 test!func("&", "&"); 430 test!func("&&&&", "&&&&"); 431 test!func("&&&&", "&&&&"); 432 test!func("&#", "&#"); 433 test!func("&#;", "&#;"); 434 test!func("�", "�"); 435 test!func("�", "�"); 436 test!func("0", "0"); 437 test!func("�amp;", "�amp;"); 438 test!func("&#amp;", "&#amp;"); 439 test!func("&#x", "&#x"); 440 test!func("&#x;", "&#x;"); 441 test!func("�", "�"); 442 test!func("	", "\t"); 443 test!func(" ", " "); 444 test!func("ディラン", "ディラン"); 445 }} 446 } 447 448 version(dxmlTests) @safe pure unittest 449 { 450 import std.algorithm.comparison : equal; 451 import dxml.internal : testRangeFuncs; 452 453 static foreach(func; testRangeFuncs) 454 {{ 455 assert(decodeXML(func("foo")) == "foo"); 456 assert(equal(asDecodedXML(func("foo")), "foo")); 457 }} 458 } 459 460 461 /++ 462 This parses one of the five, predefined entity references mention in the XML 463 spec from the front of a range of characters. 464 465 If the given range starts with one of the five, predefined entity 466 references, then it is removed from the range, and the corresponding 467 character is returned. 468 469 If the range does not start with one of those references, then the return 470 value is null, and the range is unchanged. 
471 472 $(TABLE 473 $(TR $(TH Std Entity Ref)$(TH Converts To)) 474 $(TR $(TD $(D_CODE_STRING $(AMP)amp;))$(TD $(D_CODE_STRING &))) 475 $(TR $(TD $(D_CODE_STRING $(AMP)gt;))$(TD $(D_CODE_STRING >))) 476 $(TR $(TD $(D_CODE_STRING $(AMP)lt;))$(TD $(D_CODE_STRING $(LT)))) 477 $(TR $(TD $(D_CODE_STRING $(AMP)apos;))$(TD $(D_CODE_STRING '))) 478 $(TR $(TD $(D_CODE_STRING $(AMP)quot;))$(TD $(D_CODE_STRING "))) 479 ) 480 481 Any other entity references would require processing a DTD section in order 482 to be handled and are untouched by parseStdEntityRef as are any other types 483 of references. 484 485 Params: 486 range = A range of characters. 487 488 Returns: The character represented by the predefined entity reference that 489 was parsed from the front of the given range or null if the range 490 did not start with one of the five predefined entity references. 491 492 See_Also: $(LINK http://www.w3.org/TR/REC-xml/#dt-chardata)$(BR) 493 $(LREF parseCharRef)$(BR) 494 $(LREF decodeXML)$(BR) 495 $(LREF asDecodedXML) 496 +/ 497 Nullable!dchar parseStdEntityRef(R)(ref R range) 498 if(isForwardRange!R && isSomeChar!(ElementType!R)) 499 { 500 import std.algorithm.searching : startsWith; 501 import std.typecons : nullable, tuple; 502 import std.utf : byCodeUnit; 503 504 auto orig = range.save; 505 506 static if(isNarrowString!R) 507 auto cuRange = range.byCodeUnit(); 508 else 509 alias cuRange = range; 510 511 if(!cuRange.save.startsWith('&')) 512 return typeof(return).init; 513 cuRange.popFront(); 514 515 if(cuRange.empty) 516 goto invalid; 517 518 static foreach(t; [tuple("amp;", '&'), tuple("gt;", '>'), tuple("lt;", '<'), 519 tuple("apos;", '\''), tuple("quot;", '"')]) 520 { 521 if(cuRange.save.startsWith(t[0])) 522 { 523 cuRange.popFrontN(t[0].length); 524 static if(isNarrowString!R) 525 range = cuRange.source; 526 return nullable(cast(dchar)t[1]); 527 } 528 } 529 530 invalid: range = orig; 531 return typeof(return).init; 532 } 533 534 /// 535 version(dxmlTests) unittest 
536 { 537 { 538 auto range = "&foo"; 539 assert(range.parseStdEntityRef() == '&'); 540 assert(range == "foo"); 541 } 542 { 543 auto range = ">bar"; 544 assert(range.parseStdEntityRef() == '>'); 545 assert(range == "bar"); 546 } 547 { 548 auto range = "<baz"; 549 assert(range.parseStdEntityRef() == '<'); 550 assert(range == "baz"); 551 } 552 { 553 auto range = "'dlang"; 554 assert(range.parseStdEntityRef() == '\''); 555 assert(range == "dlang"); 556 } 557 { 558 auto range = ""rocks"; 559 assert(range.parseStdEntityRef() == '"'); 560 assert(range == "rocks"); 561 } 562 { 563 auto range = " &foo"; 564 assert(range.parseStdEntityRef().isNull); 565 assert(range == " &foo"); 566 } 567 { 568 auto range = "&Amp;hello"; 569 assert(range.parseStdEntityRef().isNull); 570 assert(range == "&Amp;hello"); 571 } 572 { 573 auto range = " foo"; 574 assert(range.parseStdEntityRef().isNull); 575 assert(range == " foo"); 576 } 577 { 578 auto range = "hello world"; 579 assert(range.parseStdEntityRef().isNull); 580 assert(range == "hello world"); 581 } 582 } 583 584 version(dxmlTests) unittest 585 { 586 import std.algorithm.comparison : equal; 587 import dxml.internal : testRangeFuncs; 588 589 static foreach(func; testRangeFuncs) 590 { 591 for(auto range = func(";Amp;amp;&#amp;©& amp;"); !range.empty; range.popFront()) 592 { 593 auto temp = range.save; 594 assert(temp.parseStdEntityRef().isNull); 595 assert(equal(range.save, temp.save)); 596 } 597 { 598 auto range = func("&"); 599 assert(range.parseStdEntityRef().isNull); 600 assert(equal(range.save, "&")); 601 } 602 { 603 auto range = func(" &><'""); 604 assert(range.parseStdEntityRef().isNull); 605 assert(equal(range.save, " &><'"")); 606 range.popFront(); 607 608 assert(range.parseStdEntityRef() == '&'); 609 assert(equal(range.save, "><'"")); 610 assert(range.parseStdEntityRef() == '>'); 611 assert(equal(range.save, "<'"")); 612 assert(range.parseStdEntityRef() == '<'); 613 assert(equal(range.save, "'"")); 614 
assert(range.parseStdEntityRef() == '\''); 615 assert(equal(range.save, """)); 616 assert(range.parseStdEntityRef() == '"'); 617 assert(range.empty); 618 } 619 } 620 } 621 622 version(dxmlTests) @safe pure unittest 623 { 624 import dxml.internal : testRangeFuncs; 625 626 static foreach(func; testRangeFuncs) 627 {{ 628 auto range = func("foo"); 629 assert(range.parseStdEntityRef().isNull); 630 }} 631 } 632 633 634 /++ 635 If the given range starts with a valid, XML, character reference, it is 636 removed from the range, and the corresponding character is returned. 637 638 If the range does not start with a valid, XML, character reference, then 639 the return value is null, and the range is unchanged. 640 641 Params: 642 range = A range of characters. 643 644 Returns: The character represented by the character reference that was 645 parsed from the front of the given range or null if the range did 646 not start with a valid, XML, character reference. 647 648 See_Also: $(LINK http://www.w3.org/TR/REC-xml/#NT-CharRef)$(BR) 649 $(LREF parseStdEntityRef)$(BR) 650 $(LREF decodeXML)$(BR) 651 $(LREF asDecodedXML)$(BR) 652 $(LREF encodeCharRef) 653 +/ 654 Nullable!dchar parseCharRef(R)(ref R range) 655 if(isForwardRange!R && isSomeChar!(ElementType!R)) 656 { 657 import std.algorithm.searching : startsWith; 658 import std.conv : ConvException, parse, to; 659 import std.range : popFrontN; 660 import std.typecons : nullable; 661 import std.utf : byCodeUnit; 662 import dxml.internal : isXMLChar; 663 664 auto orig = range.save; 665 666 static if(isNarrowString!R) 667 auto cuRange = range.byCodeUnit(); 668 else 669 alias cuRange = range; 670 671 if(!cuRange.save.startsWith("&#")) 672 return typeof(return).init; 673 cuRange.popFrontN(2); 674 675 if(cuRange.empty) 676 goto invalid; 677 678 { 679 bool hex = false; 680 if(cuRange.front == 'x') 681 { 682 cuRange.popFront(); 683 hex = true; 684 // https://issues.dlang.org/show_bug.cgi?id=18248 685 import std.ascii : isHexDigit; 686 
if(cuRange.empty || !isHexDigit(cuRange.front)) 687 goto invalid; 688 } 689 try 690 { 691 immutable c = to!dchar(cuRange.parse!uint(hex ? 16 : 10)); 692 if(!cuRange.startsWith(';') || (c != '\n' && !isXMLChar(c))) 693 goto invalid; 694 cuRange.popFront(); 695 static if(isNarrowString!R) 696 range = cuRange.source; 697 return nullable(cast()c); 698 } 699 catch(ConvException) 700 {} 701 } 702 703 invalid: range = orig; 704 return typeof(return).init; 705 } 706 707 /// 708 version(dxmlTests) unittest 709 { 710 import std.range.primitives : empty; 711 712 { 713 auto range = "0 hello world"; 714 assert(parseCharRef(range) == '0'); 715 assert(range == " hello world"); 716 } 717 { 718 auto range = "0 hello world"; 719 assert(parseCharRef(range) == '0'); 720 assert(range == " hello world"); 721 } 722 { 723 auto range = "ディラン"; 724 assert(parseCharRef(range) == 'デ'); 725 assert(range == "ィラン"); 726 assert(parseCharRef(range) == 'ィ'); 727 assert(range == "ラン"); 728 assert(parseCharRef(range) == 'ラ'); 729 assert(range == "ン"); 730 assert(parseCharRef(range) == 'ン'); 731 assert(range.empty); 732 } 733 { 734 auto range = "&#x;foo"; 735 assert(parseCharRef(range).isNull); 736 assert(range == "&#x;foo"); 737 } 738 { 739 auto range = "foobar"; 740 assert(parseCharRef(range).isNull); 741 assert(range == "foobar"); 742 } 743 { 744 auto range = " &x48;"; 745 assert(parseCharRef(range).isNull); 746 assert(range == " &x48;"); 747 } 748 } 749 750 version(dxmlTests) unittest 751 { 752 import std.algorithm.comparison : equal; 753 import dxml.internal : testRangeFuncs; 754 755 static foreach(func; testRangeFuncs) 756 { 757 for(auto range = func(";;&#;&#G;�&#F;"); !range.empty; range.popFront()) 758 { 759 auto temp = range.save; 760 assert(temp.parseCharRef().isNull); 761 assert(equal(range.save, temp.save)); 762 } 763 { 764 auto range = func("A"); 765 assert(range.parseCharRef().isNull); 766 assert(equal(range.save, "A")); 767 } 768 { 769 auto range = func(" ABC 京都市"); 770 
assert(range.parseCharRef().isNull); 771 assert(equal(range.save, " ABC 京都市")); 772 range.popFront(); 773 774 assert(range.parseCharRef() == 'A'); 775 assert(equal(range.save, "BC 京都市")); 776 assert(range.parseCharRef() == 'B'); 777 assert(equal(range.save, "C 京都市")); 778 assert(range.parseCharRef() == 'C'); 779 assert(equal(range.save, " 京都市")); 780 781 assert(range.parseCharRef().isNull); 782 assert(equal(range.save, " 京都市")); 783 range.popFront(); 784 785 assert(range.parseCharRef() == '京'); 786 assert(equal(range.save, "都市")); 787 assert(range.parseCharRef() == '都'); 788 assert(equal(range.save, "市")); 789 assert(range.parseCharRef() == '市'); 790 assert(range.empty); 791 } 792 } 793 } 794 795 version(dxmlTests) @safe pure unittest 796 { 797 import dxml.internal : testRangeFuncs; 798 799 static foreach(func; testRangeFuncs) 800 {{ 801 auto range = func("foo"); 802 assert(range.parseCharRef().isNull); 803 }} 804 } 805 806 807 /++ 808 Strips the indent from a character range (most likely from 809 $(REF_ALTTEXT Entity.text, EntityRange.Entity.text, dxml, parser)). 810 The idea is that if the XML is formatted to be human-readable, and it's 811 multiple lines long, the lines are likely to be indented, but the 812 application probably doesn't want that extra whitespace. So, stripIndent 813 and withoutIndent attempt to intelligently strip off the leading 814 whitespace. 815 816 For these functions, whitespace is considered to be some combination of 817 $(D_CODE_STRING ' '), $(D_CODE_STRING '\t'), and $(D_CODE_STRING '\r') 818 ($(D_CODE_STRING '\n') is used to delineate lines, so it's not considered 819 whitespace). 820 821 Whitespace characters are stripped from the start of the first line, and 822 then those same number of whitespace characters are stripped from the 823 beginning of each subsequent line (or up to the first non-whitespace 824 character if the line starts with fewer whitespace characters). 
825 826 If the first line has no leading whitespace, then the leading whitespace on 827 the second line is treated as the indent. This is done to handle the case where 828 there is text immediately after a start tag and then subsequent lines are 829 indented rather than the text starting on the line after the start tag. 830 831 If neither of the first two lines has any leading whitespace, then no 832 whitespace is stripped. 833 834 So, if the text is well-formatted, then the indent should be cleanly 835 removed, and if it's unformatted or badly formatted, then no characters 836 other than leading whitespace will be removed, and in principle, no real 837 data will have been lost - though of course, it's up to the programmer to 838 decide whether it's better for the application to try to cleanly strip the 839 indent or to leave the text as-is. 840 841 The difference between stripIndent and withoutIndent is that stripIndent 842 returns a $(K_STRING), whereas withoutIndent returns a lazy range 843 of code units. In the case where a $(K_STRING) is passed to 844 stripIndent, it will simply return the original string if there is no 845 indent (whereas in other cases, stripIndent and withoutIndent are forced to 846 return new ranges). 847 848 Params: 849 range = A range of characters. 850 851 Returns: The text with the indent stripped from each line. stripIndent 852 returns a $(K_STRING), whereas withoutIndent returns a lazy range 853 of code units (so it could be a range of $(K_CHAR) or $(K_WCHAR) 854 and not just $(K_DCHAR); which it is depends on the code units of 855 the range being passed in). 
856 857 See_Also: $(REF EntityRange.Entity.text, dxml, parser) 858 +/ 859 string stripIndent(R)(R range) 860 if(isForwardRange!R && isSomeChar!(ElementType!R)) 861 { 862 import std.conv : to; 863 864 static if(isDynamicArray!R && is(Unqual!(ElementEncodingType!R) == char)) 865 { 866 static bool notHWhite(char c) 867 { 868 switch(c) 869 { 870 case ' ': 871 case '\t': 872 case '\r': return false; 873 default : return true; 874 } 875 } 876 877 import std.algorithm.searching : find; 878 import std.utf : byCodeUnit; 879 880 if(range.empty) 881 return range.to!string(); 882 883 auto orig = range.save; 884 auto text = range.byCodeUnit(); 885 ElementEncodingType!R[] firstLine; 886 887 if(notHWhite(text.front)) 888 { 889 text = text.find('\n'); 890 if(text.empty) 891 return orig.to!string(); 892 text.popFront(); 893 firstLine = orig[0 .. orig.length - text.length]; 894 } 895 896 immutable beforeIndent = text.length; 897 text = text.find!notHWhite(); 898 if(text.empty) 899 return firstLine.empty ? "" : firstLine[0 .. $ - 1].to!string(); 900 immutable indent = beforeIndent - text.length; 901 902 if(indent == 0) 903 return orig.to!string(); 904 905 import std.array : appender; 906 auto retval = appender!string(); 907 retval.reserve(orig.length / 3); 908 909 // > 1 because we don't want a newline by itself. 910 if(firstLine.length > 1) 911 put(retval, firstLine); 912 913 outer: while(true) 914 { 915 auto start = text.save; 916 text = text.find('\n'); 917 if(text.empty) 918 { 919 if(!start.empty) 920 put(retval, start); 921 return retval.data; 922 } 923 text.popFront(); 924 auto line = start[0 .. $ - text.length]; 925 foreach(_; 0 .. indent) 926 { 927 if(text.empty) 928 goto isEmpty; 929 if(notHWhite(text.front)) 930 goto notEmpty; 931 text.popFront(); 932 } 933 if(text.empty) 934 { 935 isEmpty: put(retval, line[0 .. $ - 1]); 936 return retval.data; 937 } 938 notEmpty: put(retval, line); 939 } 940 // The compiler is not smart enough to realize that this line is unreachable. 
941 assert(0); 942 } 943 else 944 return range.withoutIndent().to!string(); 945 } 946 947 /// Ditto 948 auto withoutIndent(R)(R range) 949 if(isForwardRange!R && isSomeChar!(ElementType!R)) 950 { 951 import std.utf : byCodeUnit; 952 953 static struct WithoutIndent 954 { 955 public: 956 957 @property empty() { return _line.empty; } 958 959 @property front() { return _line.front; } 960 961 void popFront() 962 { 963 if(_indent == 0) 964 { 965 _line.popFront(); 966 return; 967 } 968 969 if(_line.front == '\n') 970 _nextLine(); 971 else 972 _line.popFront(); 973 // Skip last newline 974 if(_range.empty && !_line.empty && _line.front == '\n') 975 _line = _range; 976 } 977 978 @property save() 979 { 980 auto retval = this; 981 retval._line = _line.save; 982 retval._range = _range.save; 983 return retval; 984 } 985 986 private: 987 988 static bool notHWhite(ElementEncodingType!R c) 989 { 990 switch(c) 991 { 992 case ' ': 993 case '\t': 994 case '\r': return false; 995 default : return true; 996 } 997 } 998 999 void _nextLine() 1000 { 1001 import std.algorithm.searching : find; 1002 _line = _range.save; 1003 _range = _range.find('\n'); 1004 if(_range.empty) 1005 return; 1006 _range.popFront(); 1007 _popIndent(); 1008 } 1009 1010 void _popIndent() 1011 { 1012 foreach(_; 0 .. 
_indent)
            {
                if(_range.empty)
                    return;
                if(notHWhite(_range.front))
                    return;
                _range.popFront();
            }
        }

        this(R range)
        {
            import std.algorithm : countUntil, find;
            import std.range : popFrontN;

            _range = byCodeUnit(range);
            if(_range.empty)
            {
                _line = _range;
                return;
            }

            auto orig = _range.save;
            immutable noFirstIndent = notHWhite(_range.front);
            if(noFirstIndent)
            {
                // The first line does not start with horizontal whitespace,
                // so the indent is measured on whatever follows the first
                // newline (if there is one).
                _range = _range.find('\n');
                if(_range.empty)
                    goto noIndent;
                _range.popFront();
            }

            _indent = _range.save.countUntil!(a => notHWhite(a))();
            if(_indent == 0)
            {
            noIndent:
                // No indent to strip; keep the original text as the line.
                _line = orig;
                return;
            }
            if(noFirstIndent && orig.front != '\n')
            {
                // The first line holds text, so restart from the beginning
                // instead of from the line the indent was measured on.
                _range = orig;
                _popIndent();
            }
            else
                _range.popFrontN(_indent);
            _nextLine();
        }

        typeof(byCodeUnit(R.init)) _range;
        typeof(byCodeUnit(R.init)) _line;
        size_t _indent;
    }

    return WithoutIndent(range);
}

// NOTE(review): the indentation widths inside the string literals of the
// following unittests were reconstructed so that each assertion agrees with
// the comment describing it - confirm against the upstream tests.
///
version(dxmlTests) unittest
{
    import std.algorithm.comparison : equal;

    // The prime use case for these two functions is for an Entity.text section
    // that is formatted to be human-readable, and the rules of what whitespace
    // is stripped from the beginning or end of the range are geared towards
    // the text coming from a well-formatted Entity.text section.
    {
        import dxml.parser;
        auto xml = "<root>\n" ~
                   "    <code>\n" ~
                   "        bool isASCII(string str)\n" ~
                   "        {\n" ~
                   "            import std.algorithm : all;\n" ~
                   "            import std.ascii : isASCII;\n" ~
                   "            return str.all!isASCII();\n" ~
                   "        }\n" ~
                   "    </code>\n" ~
                   "</root>";
        auto range = parseXML(xml);
        range.popFront();
        range.popFront();
        assert(range.front.type == EntityType.text);
        assert(range.front.text ==
               "\n" ~
               "        bool isASCII(string str)\n" ~
               "        {\n" ~
               "            import std.algorithm : all;\n" ~
               "            import std.ascii : isASCII;\n" ~
               "            return str.all!isASCII();\n" ~
               "        }\n" ~
               "    ");
        assert(range.front.text.stripIndent() ==
               "bool isASCII(string str)\n" ~
               "{\n" ~
               "    import std.algorithm : all;\n" ~
               "    import std.ascii : isASCII;\n" ~
               "    return str.all!isASCII();\n" ~
               "}");
    }

    // The indent that is stripped matches the amount of whitespace at the front
    // of the first line.
    assert(("    start\n" ~
            "    foo\n" ~
            "    bar\n" ~
            "        baz\n" ~
            "        xyzzy\n" ~
            "           ").stripIndent() ==
           "start\n" ~
           "foo\n" ~
           "bar\n" ~
           "    baz\n" ~
           "    xyzzy\n" ~
           "       ");

    // If the first line has no leading whitespace but the second line does,
    // then the second line's leading whitespace is treated as the indent.
    assert(("foo\n" ~
            "    bar\n" ~
            "        baz\n" ~
            "        xyzzy").stripIndent() ==
           "foo\n" ~
           "bar\n" ~
           "    baz\n" ~
           "    xyzzy");

    assert(("\n" ~
            "    foo\n" ~
            "    bar\n" ~
            "        baz\n" ~
            "        xyzzy").stripIndent() ==
           "foo\n" ~
           "bar\n" ~
           "    baz\n" ~
           "    xyzzy");

    // If neither of the first two lines has leading whitespace, then nothing
    // is stripped.
    assert(("foo\n" ~
            "bar\n" ~
            "    baz\n" ~
            "    xyzzy\n" ~
            "   ").stripIndent() ==
           "foo\n" ~
           "bar\n" ~
           "    baz\n" ~
           "    xyzzy\n" ~
           "   ");

    // If a subsequent line starts with less whitespace than the indent, then
    // all of its leading whitespace is stripped but no other characters are
    // stripped.
    assert(("    foo\n" ~
            "         bar\n" ~
            "  baz\n" ~
            "         xyzzy").stripIndent() ==
           "foo\n" ~
           "     bar\n" ~
           "baz\n" ~
           "     xyzzy");

    // If the last line is just the indent, then it and the newline before it
    // are stripped.
    assert(("    foo\n" ~
            "        bar\n" ~
            "    ").stripIndent() ==
           "foo\n" ~
           "    bar");

    // If the last line is just whitespace, but it's more than the indent, then
    // the whitespace after the indent is kept.
    assert(("    foo\n" ~
            "        bar\n" ~
            "        ").stripIndent() ==
           "foo\n" ~
           "    bar\n" ~
           "    ");

    // withoutIndent does the same as stripIndent but with a lazy range.
    assert(equal(("  foo\n" ~
                  "    bar\n" ~
                  "    baz\n").withoutIndent(),
                 "foo\n" ~
                 "  bar\n" ~
                 "  baz"));
}

version(dxmlTests) unittest
{
    import core.exception : AssertError;
    import std.algorithm.comparison : equal;
    import std.exception : enforce;
    import std.utf : byUTF;
    import dxml.internal : testRangeFuncs;

    // Runs both stripIndent and withoutIndent on text wrapped by func and
    // reports failures at the caller's line.
    static void test(alias func)(string text, string expected, size_t line = __LINE__)
    {
        auto range = func(text);
        enforce!AssertError(range.save.stripIndent() == expected, "unittest failed 1", __FILE__, line);
        alias C = ElementType!(typeof(range.save.withoutIndent()));
        enforce!AssertError(equal(range.save.withoutIndent(), expected.byUTF!C), "unittest failed 2", __FILE__, line);
    }

    static foreach(func; testRangeFuncs)
    {
        test!func("", "");
        test!func("     ", "");
        test!func("foo", "foo");
        test!func("\nfoo", "\nfoo");
        test!func("    foo", "foo");
        test!func("\n    foo", "foo");
        test!func("\n    foo\n", "foo");
        test!func("\n    foo\n    ", "foo");
        test!func("\n    foo\n     ", "foo\n ");
        test!func("foo\n    ", "foo");

        test!func("  foo\n  bar  \n    baz", "foo\nbar  \n  baz");
        test!func("  foo\nbar\n  baz", "foo\nbar\nbaz");
        test!func("  foo\n bar\n  baz", "foo\nbar\nbaz");
        test!func("  foo\n  bar\n  baz", "foo\nbar\nbaz");
        test!func("  foo\n   bar\n  baz", "foo\n bar\nbaz");
        test!func("  foo\n    bar\n  baz", "foo\n  bar\nbaz");
        test!func("  foo\n     bar\n  baz", "foo\n   bar\nbaz");
        test!func("  foo\n     bar\n  baz\n\n\n\n\n", "foo\n   bar\nbaz\n\n\n\n");

        test!func("     foo\n  bar\n       baz", "foo\nbar\n  baz");

        test!func("foo\n     bar\n      baz", "foo\nbar\n baz");
        test!func("foo\nbar\n     baz\n", "foo\nbar\n     baz\n");
    }
}

version(dxmlTests) @safe pure unittest
{
    import std.algorithm.comparison : equal;
    import dxml.internal : testRangeFuncs;

    static foreach(func; testRangeFuncs)
{{
        assert(stripIndent(func("foo")) == "foo");
        assert(equal(withoutIndent(func("foo")), "foo"));
    }}
}


/++
    The string representations of the five entity references predefined by the
    XML spec.

    See_Also: $(LINK http://www.w3.org/TR/REC-xml/#dt-chardata)$(BR)
              $(LREF parseStdEntityRef)
  +/
enum StdEntityRef
{
    /// Entity reference for $(D_CODE_STRING $(AMP))
    amp = "&amp;",

    /// Entity reference for $(D_CODE_STRING >)
    gt = "&gt;",

    /// Entity reference for $(D_CODE_STRING <)
    lt = "&lt;",

    /// Entity reference for $(D_CODE_STRING ')
    apos = "&apos;",

    /// Entity reference for $(D_CODE_STRING ")
    quot = "&quot;",
}


/++
    Returns a lazy range of code units which encodes any characters which cannot
    be put in an $(REF EntityType._text, dxml, parser) in their literal form.

    encodeText is intended primarily to be used with
    $(REF XMLWriter.writeText, dxml, writer) to ensure that characters which
    cannot appear in their literal form do not appear in their literal form.

    Specifically, what encodeText does is

    $(TABLE
        $(TR $(TD convert $(D_CODE_STRING &) to $(D_CODE_STRING $(AMP)amp;) ))
        $(TR $(TD convert $(D_CODE_STRING <) to $(D_CODE_STRING $(AMP)lt;) ))
        $(TR $(TD convert $(D_CODE_STRING ]]>) to $(D_CODE_STRING ]]$(AMP)gt;) ))
    )

    Params:
        text = The forward range of characters to encode.

    Returns: A forward range of code units with the replacements listed above
             applied; all other code units are passed through unchanged.

    See_Also: $(REF XMLWriter.writeText, dxml, writer)$(BR)
              $(LREF encodeAttr)$(BR)
              $(LREF decodeXML)$(BR)
              $(LREF asDecodedXML)
  +/
auto encodeText(R)(R text)
    if(isForwardRange!R && isSomeChar!(ElementType!R))
{
    import std.utf : byCodeUnit;

    static struct EncodeText
    {
    public:

        // While an encoded sequence is pending (_len != 0), its code units are
        // read from the end of _buffer towards the start.
        @property front() { return _len == 0 ? _text.front : cast(ElementEncodingType!R)_buffer[_len - 1]; }

        @property empty() { return _text.empty; }

        void popFront()
        {
            if(_len != 0)
            {
                if(--_len != 0)
                    return;
            }
            // Either no sequence was pending or it has just been exhausted, so
            // the code unit of _text that it stood for is consumed now.
            _text.popFront();
            _handleEntity();
        }

        @property save()
        {
            auto retval = this;
            retval._text = _text.save;
            return retval;
        }

    private:

        // If the front of _text needs to be encoded, loads the replacement
        // into _buffer. The replacement is stored reversed, because front
        // reads _buffer[_len - 1] while _len counts down.
        void _handleEntity()
        {
            if(_text.empty)
                return;
            switch(_text.front)
            {
                case '&':
                {
                    enum entity = ";pma&";
                    _buffer = entity;
                    _len = entity.length;
                    return;
                }
                case '<':
                {
                    enum entity = ";tl&";
                    _buffer = entity;
                    _len = entity.length;
                    return;
                }
                case ']':
                {
                    import std.range : dropOne;

                    // FIXME This should use startsWith, but for some reason,
                    // startsWith doesn't currently work with @nogc or nothrow
                    // even when this code should be able to be @nogc and/or
                    // nothrow.
                    auto temp = _text.save.dropOne();
                    if(!temp.empty && temp.front == ']')
                    {
                        temp.popFront();
                        if(!temp.empty && temp.front == '>')
                        {
                            // Leave _text on the '>' so that the single
                            // _text.popFront() in popFront consumes the whole
                            // "]]>" once the buffer has been drained.
                            _text = temp;
                            enum entity = ";tg&]]";
                            _buffer = entity;
                            _len = entity.length;
                        }
                    }
                    return;
                }
                default: return;
            }
        }

        this(R text)
        {
            _text = byCodeUnit(text);
            _handleEntity();
        }

        // Sized for the longest replacement, "]]&gt;".
        char["]]&gt;".length] _buffer;
        size_t _len;
        typeof(byCodeUnit(R.init)) _text;
    }

    return EncodeText(text);
}

///
version(dxmlTests) @safe pure nothrow @nogc unittest
{
    import std.algorithm.comparison : equal;

    assert(equal(encodeText(`foo & bar`), `foo &amp; bar`));
    assert(equal(encodeText(`foo < bar`), `foo &lt; bar`));
    assert(equal(encodeText(`foo > bar`), `foo > bar`));
    assert(equal(encodeText(`foo ' bar`), `foo ' bar`));
    assert(equal(encodeText(`foo " bar`), `foo " bar`));
    assert(equal(encodeText("foo ]]> bar"), "foo ]]&gt; bar"));

    assert(equal(encodeText("hello world"), "hello world"));
}

version(dxmlTests) @safe pure unittest
{
    import std.algorithm.comparison : equal;
    import dxml.internal : testRangeFuncs;

    static foreach(func; testRangeFuncs)
    {{
        assert(encodeText(func("")).empty);
        assert(equal(encodeText(func(`& < > ' "`)), `&amp; &lt; > ' "`));
        assert(equal(encodeText(func("&&&")), "&amp;&amp;&amp;"));

        auto range = encodeText(func(`&&<<>>''""hello ] ]> world"">><<&&`));
        assert(equal(range.save, range.save));
        assert(equal(range.save, `&amp;&amp;&lt;&lt;>>''""hello ] ]> world"">>&lt;&lt;&amp;&amp;`));
    }}
}


/++
    Returns a lazy range of code units which encodes any characters which cannot
    be put in an attribute value of an element tag in their literal form.

    encodeAttr is intended primarily to be used with
    $(REF XMLWriter.writeAttr, dxml, writer) to ensure that characters
    which cannot appear in their literal form do not appear in their literal
    form.

    Specifically, what encodeAttr does is

    $(TABLE
        $(TR $(TD convert $(D_CODE_STRING &) to $(D_CODE_STRING $(AMP)amp;) ))
        $(TR $(TD convert $(D_CODE_STRING <) to $(D_CODE_STRING $(AMP)lt;) ))
        $(TR $(TD convert $(D_CODE_STRING ') to $(D_CODE_STRING $(AMP)apos;) if
                  $(D quote == $(D_STRING '\''))))
        $(TR $(TD convert $(D_CODE_STRING ") to $(D_CODE_STRING $(AMP)quot;) if
                  $(D quote == $(D_STRING '"'))))
    )

    Params:
        quote = The quote character which will delimit the attribute value
                (either $(D_STRING '"') or $(D_STRING '\'')); only that
                quote character is encoded.
        text = The forward range of characters to encode.

    Returns: A forward range of code units with the replacements listed above
             applied; all other code units are passed through unchanged.

    See_Also: $(REF XMLWriter.writeAttr, dxml, writer)$(BR)
              $(LREF encodeText)$(BR)
              $(LREF decodeXML)$(BR)
              $(LREF asDecodedXML)
  +/
auto encodeAttr(char quote = '"', R)(R text)
    if((quote == '"' || quote == '\'') && isForwardRange!R && isSomeChar!(ElementType!R))
{
    import std.utf : byCodeUnit;

    static struct EncodeAttr
    {
    public:

        // While an encoded sequence is pending (_len != 0), its code units are
        // read from the end of _buffer towards the start.
        @property front() { return _len == 0 ? _text.front : cast(ElementEncodingType!R)_buffer[_len - 1]; }

        @property empty() { return _text.empty; }

        void popFront()
        {
            if(_len != 0)
            {
                if(--_len != 0)
                    return;
            }
            // Either no sequence was pending or it has just been exhausted, so
            // the code unit of _text that it stood for is consumed now.
            _text.popFront();
            _handleEntity();
        }

        @property save()
        {
            auto retval = this;
            retval._text = _text.save;
            return retval;
        }

    private:

        // If the front of _text needs to be encoded, loads the replacement
        // into _buffer. The replacement is stored reversed, because front
        // reads _buffer[_len - 1] while _len counts down.
        void _handleEntity()
        {
            if(_text.empty)
                return;
            switch(_text.front)
            {
                case '&':
                {
                    enum entity = ";pma&";
                    _buffer = entity;
                    _len = entity.length;
                    return;
                }
                case '<':
                {
                    enum entity = ";tl&";
                    _buffer = entity;
                    _len = entity.length;
                    return;
                }
                case quote:
                {
                    static if(quote == '"')
                        enum entity = ";touq&";
                    else
                        enum entity = ";sopa&";
                    _buffer = entity;
                    _len = entity.length;
                    return;
                }
                default: return;
            }
        }

        this(R text)
        {
            _text = byCodeUnit(text);
            _handleEntity();
        }

        // Sized for the longest replacements, "&quot;" and "&apos;".
        char["&quot;".length] _buffer;
        size_t _len;
        typeof(byCodeUnit(R.init)) _text;
    }

    return EncodeAttr(text);
}

///
version(dxmlTests) @safe pure nothrow @nogc unittest
{
    import std.algorithm.comparison : equal;

    assert(equal(encodeAttr(`foo & bar`), `foo &amp; bar`));
    assert(equal(encodeAttr(`foo < bar`), `foo &lt; bar`));
    assert(equal(encodeAttr(`foo > bar`), `foo > bar`));
    assert(equal(encodeAttr(`foo ' bar`), `foo ' bar`));
    assert(equal(encodeAttr(`foo " bar`), `foo &quot; bar`));

    assert(equal(encodeAttr!'\''(`foo ' bar`), `foo &apos; bar`));
    assert(equal(encodeAttr!'\''(`foo " bar`), `foo " bar`));

    assert(equal(encodeAttr("hello world"), "hello world"));
}

version(dxmlTests) @safe pure unittest
{
    import std.algorithm.comparison : equal;
    import dxml.internal : testRangeFuncs;

    static
foreach(func; testRangeFuncs)
    {{
        assert(encodeAttr(func("")).empty);
        assert(encodeAttr!'\''(func("")).empty);
        assert(equal(encodeAttr(func(`& < > ' "`)), `&amp; &lt; > ' &quot;`));
        assert(equal(encodeAttr!'\''(func(`& < > ' "`)), `&amp; &lt; > &apos; "`));
        assert(equal(encodeAttr(func("&&&")), "&amp;&amp;&amp;"));

        {
            auto range = encodeAttr(func(`&&<<>>''""hello world"">><<&&`));
            assert(equal(range.save, range.save));
            assert(equal(range.save, `&amp;&amp;&lt;&lt;>>''&quot;&quot;hello world&quot;&quot;>>&lt;&lt;&amp;&amp;`));
        }

        {
            auto range = encodeAttr!'\''(func(`&&<<>>''""hello world"">><<&&`));
            assert(equal(range.save, range.save));
            assert(equal(range.save, `&amp;&amp;&lt;&lt;>>&apos;&apos;""hello world"">>&lt;&lt;&amp;&amp;`));
        }
    }}
}


/++
    Returns a range of $(K_CHAR) containing the character reference
    corresponding to the given character.

    The reference is written in hexadecimal form (e.g.
    $(D_CODE_STRING $(AMP)#x41;) for $(D_CODE_STRING 'A')).

    Params:
        c = The character to encode.

    Returns: A forward range of $(K_CHAR) holding the character reference.

    See_Also: $(LREF parseCharRef)
  +/
auto encodeCharRef(dchar c)
{
    static struct EncodeCharRef
    {
    public:

        @property front() { return _buffer[_index]; }

        // '$' is written after the reference as an end marker; it is never
        // returned by front.
        @property empty() { return _buffer[_index] == '$'; }

        void popFront() { ++_index; }

        @property save() { return this; }

    private:

        import std.conv : to;

        // Number of decimal digits in dchar.max plus 5 code units for the
        // "&#x", ";" and "$" which surround the (hexadecimal) digits.
        char[to!string(cast(uint)dchar.max).length + 5] _buffer;
        size_t _index;
    }

    import std.format : formattedWrite;
    import std.string : representation;

    EncodeCharRef retval;
    formattedWrite!"&#x%x;$"(retval._buffer[].representation, c);
    return retval;
}

///
version(dxmlTests) unittest
{
    import std.algorithm.comparison : equal;

    assert(equal(encodeCharRef(' '), "&#x20;"));
    assert(equal(encodeCharRef('A'), "&#x41;"));
    assert(equal(encodeCharRef('\u2424'), "&#x2424;"));

    auto range = encodeCharRef('*');
    assert(parseCharRef(range) == '*');
}

version(dxmlTests) unittest
{
    import std.algorithm.comparison : equal;

    enum pound = "&#x23;";
    auto range = encodeCharRef('#');
    assert(equal(range.save, range.save));
    assert(equal(range.save, pound));
}