1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2020, University of Amsterdam 7 VU University Amsterdam 8 CWI, Amsterdam 9 All rights reserved. 10 11 Redistribution and use in source and binary forms, with or without 12 modification, are permitted provided that the following conditions 13 are met: 14 15 1. Redistributions of source code must retain the above copyright 16 notice, this list of conditions and the following disclaimer. 17 18 2. Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in 20 the documentation and/or other materials provided with the 21 distribution. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 POSSIBILITY OF SUCH DAMAGE. 35*/ 36 37:- module(sgml, 38 [ load_html/3, % +Input, -DOM, +Options 39 load_xml/3, % +Input, -DOM, +Options 40 load_sgml/3, % +Input, -DOM, +Options 41 42 load_sgml_file/2, % +File, -ListOfContent 43 load_xml_file/2, % +File, -ListOfContent 44 load_html_file/2, % +File, -Document 45 46 load_structure/3, % +File, -Term, +Options 47 48 load_dtd/2, % +DTD, +File 49 load_dtd/3, % +DTD, +File, +Options 50 dtd/2, % +Type, -DTD 51 dtd_property/2, % +DTD, ?Property 52 53 new_dtd/2, % +Doctype, -DTD 54 free_dtd/1, % +DTD 55 open_dtd/3, % +DTD, +Options, -Stream 56 57 new_sgml_parser/2, % -Parser, +Options 58 free_sgml_parser/1, % +Parser 59 set_sgml_parser/2, % +Parser, +Options 60 get_sgml_parser/2, % +Parser, +Options 61 sgml_parse/2, % +Parser, +Options 62 63 sgml_register_catalog_file/2, % +File, +StartOrEnd 64 65 xml_quote_attribute/3, % +In, -Quoted, +Encoding 66 xml_quote_cdata/3, % +In, -Quoted, +Encoding 67 xml_quote_attribute/2, % +In, -Quoted 68 xml_quote_cdata/2, % +In, -Quoted 69 xml_name/1, % +In 70 xml_name/2, % +In, +Encoding 71 72 xsd_number_string/2, % ?Number, ?String 73 xsd_time_string/3, % ?Term, ?Type, ?String 74 75 xml_basechar/1, % +Code 76 xml_ideographic/1, % +Code 77 xml_combining_char/1, % +Code 78 xml_digit/1, % +Code 79 xml_extender/1, % +Code 80 81 iri_xml_namespace/2, % +IRI, -Namespace 82 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 83 xml_is_dom/1 % +Term 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89 90:- meta_predicate 91 load_structure( , , ), 92 load_html( , , ), 93 load_xml( , , ), 94 load_sgml( , , ). 95 96:- predicate_options(load_structure/3, 3, 97 [ charpos(integer), 98 cdata(oneof([atom,string])), 99 defaults(boolean), 100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 101 doctype(atom), 102 dtd(any), 103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 104 entity(atom,atom), 105 keep_prefix(boolean), 106 file(atom), 107 line(integer), 108 offset(integer), 109 number(oneof([token,integer])), 110 qualify_attributes(boolean), 111 shorttag(boolean), 112 case_sensitive_attributes(boolean), 113 case_preserving_attributes(boolean), 114 system_entities(boolean), 115 max_memory(integer), 116 space(oneof([sgml,preserve,default,remove,strict])), 117 xmlns(atom), 118 xmlns(atom,atom), 119 pass_to(sgml_parse/2, 2) 120 ]). 121:- predicate_options(load_html/3, 3, 122 [ pass_to(load_structure/3, 3) 123 ]). 124:- predicate_options(load_xml/3, 3, 125 [ pass_to(load_structure/3, 3) 126 ]). 127:- predicate_options(load_sgml/3, 3, 128 [ pass_to(load_structure/3, 3) 129 ]). 130:- predicate_options(load_dtd/3, 3, 131 [ dialect(oneof([sgml,xml,xmlns])), 132 pass_to(open/4, 4) 133 ]). 134:- predicate_options(sgml_parse/2, 2, 135 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 136 callable), 137 cdata(oneof([atom,string])), 138 content_length(integer), 139 document(-any), 140 max_errors(integer), 141 parse(oneof([file,element,content,declaration,input])), 142 source(any), 143 syntax_errors(oneof([quiet,print,style])), 144 xml_no_ns(oneof([error,quiet])) 145 ]). 146:- predicate_options(new_sgml_parser/2, 2, 147 [ dtd(any) 148 ]).
178:- multifile user:file_search_path/2. 179:- dynamic user:file_search_path/2. 180 181user:file_search_path(dtd, '.'). 182user:file_search_path(dtd, swi('library/DTD')). 183 184sgml_register_catalog_file(File, Location) :- 185 prolog_to_os_filename(File, OsFile), 186 '_sgml_register_catalog_file'(OsFile, Location). 187 188:- use_foreign_library(foreign(sgml2pl)). 189 190register_catalog(Base) :- 191 absolute_file_name(dtd(Base), 192 [ extensions([soc]), 193 access(read), 194 file_errors(fail) 195 ], 196 SocFile), 197 sgml_register_catalog_file(SocFile, end). 198 199:- initialization 200 ignore(register_catalog('HTML4')). 201 202 203 /******************************* 204 * DTD HANDLING * 205 *******************************/ 206 207/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 208Note that concurrent access to DTD objects is not allowed, and hence we 209will allocate and destroy them in each thread. Possibibly it would be 210nicer to find out why concurrent access to DTD's is flawed. It is 211diagnosed to mess with the entity resolution by Fabien Todescato. 212- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 213 214:- thread_local 215 current_dtd/2. 216:- volatile 217 current_dtd/2. 218:- thread_local 219 registered_cleanup/0. 220:- volatile 221 registered_cleanup/0. 222 223:- multifile 224 dtd_alias/2. 225 226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227 228dtd_alias(html4, 'HTML4'). 229dtd_alias(html5, 'HTML5'). 230dtd_alias(html, DTD) :- 231 current_prolog_flag(html_dialect, Dialect), 232 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
244dtd(Type, DTD) :- 245 current_dtd(Type, DTD), 246 !. 247dtd(Type, DTD) :- 248 new_dtd(Type, DTD), 249 ( dtd_alias(Type, Base) 250 -> true 251 ; Base = Type 252 ), 253 absolute_file_name(dtd(Base), 254 [ extensions([dtd]), 255 access(read) 256 ], DtdFile), 257 load_dtd(DTD, DtdFile), 258 register_cleanup, 259 asserta(current_dtd(Type, DTD)).
274load_dtd(DTD, DtdFile) :- 275 load_dtd(DTD, DtdFile, []). 276load_dtd(DTD, DtdFile, Options) :- 277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 278 setup_call_cleanup( 279 open_dtd(DTD, DTDOptions, DtdOut), 280 setup_call_cleanup( 281 open(DtdFile, read, DtdIn, OpenOptions), 282 copy_stream_data(DtdIn, DtdOut), 283 close(DtdIn)), 284 close(DtdOut)).
291:- public 292 destroy_dtds/0. % called through register_cleanup/0 293 294destroy_dtds :- 295 ( current_dtd(_Type, DTD), 296 free_dtd(DTD), 297 fail 298 ; true 299 ).
305register_cleanup :- 306 registered_cleanup, 307 !. 308register_cleanup :- 309 ( current_prolog_flag(threads, true) 310 -> prolog_listen(this_thread_exit, destroy_dtds) 311 ; true 312 ), 313 assert(registered_cleanup). 314 315 316 /******************************* 317 * EXAMINE DTD * 318 *******************************/ 319 320prop(doctype(_), _). 321prop(elements(_), _). 322prop(entities(_), _). 323prop(notations(_), _). 324prop(entity(E, _), DTD) :- 325 ( nonvar(E) 326 -> true 327 ; '$dtd_property'(DTD, entities(EL)), 328 member(E, EL) 329 ). 330prop(element(E, _, _), DTD) :- 331 ( nonvar(E) 332 -> true 333 ; '$dtd_property'(DTD, elements(EL)), 334 member(E, EL) 335 ). 336prop(attributes(E, _), DTD) :- 337 ( nonvar(E) 338 -> true 339 ; '$dtd_property'(DTD, elements(EL)), 340 member(E, EL) 341 ). 342prop(attribute(E, A, _, _), DTD) :- 343 ( nonvar(E) 344 -> true 345 ; '$dtd_property'(DTD, elements(EL)), 346 member(E, EL) 347 ), 348 ( nonvar(A) 349 -> true 350 ; '$dtd_property'(DTD, attributes(E, AL)), 351 member(A, AL) 352 ). 353prop(notation(N, _), DTD) :- 354 ( nonvar(N) 355 -> true 356 ; '$dtd_property'(DTD, notations(NL)), 357 member(N, NL) 358 ). 359 360dtd_property(DTD, Prop) :- 361 prop(Prop, DTD), 362 '$dtd_property'(DTD, Prop). 363 364 365 /******************************* 366 * SGML * 367 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.391load_structure(Spec, DOM, Options) :- 392 sgml_open_options(Options, OpenOptions, SGMLOptions), 393 setup_call_cleanup( 394 open_any(Spec, read, In, Close, OpenOptions), 395 load_structure_from_stream(In, DOM, SGMLOptions), 396 close_any(Close)). 397 398sgml_open_options(Options, OpenOptions, SGMLOptions) :- 399 Options = M:Plain, 400 ( select_option(encoding(Encoding), Plain, NoEnc) 401 -> ( sgml_encoding(Encoding) 402 -> merge_options(NoEnc, [type(binary)], OpenOptions), 403 SGMLOptions = Options 404 ; OpenOptions = Plain, 405 SGMLOptions = M:NoEnc 406 ) 407 ; merge_options(Plain, [type(binary)], OpenOptions), 408 SGMLOptions = Options 409 ). 410 411sgml_encoding(Enc) :- 412 downcase_atom(Enc, Enc1), 413 sgml_encoding_l(Enc1). 414 415sgml_encoding_l('iso-8859-1'). 416sgml_encoding_l('us-ascii'). 417sgml_encoding_l('utf-8'). 418sgml_encoding_l('utf8'). 419sgml_encoding_l('iso_latin_1'). 420sgml_encoding_l('ascii'). 421 422load_structure_from_stream(In, Term, M:Options) :- 423 ( select_option(dtd(DTD), Options, Options1) 424 -> ExplicitDTD = true 425 ; ExplicitDTD = false, 426 Options1 = Options 427 ), 428 move_front(Options1, dialect(_), Options2), % dialect sets defaults 429 setup_call_cleanup( 430 new_sgml_parser(Parser, 431 [ dtd(DTD) 432 ]), 433 parse(Parser, M:Options2, TermRead, In), 434 free_sgml_parser(Parser)), 435 ( ExplicitDTD == true 436 -> ( DTD = dtd(_, DocType), 437 dtd_property(DTD, doctype(DocType)) 438 -> true 439 ; true 440 ) 441 ; free_dtd(DTD) 442 ), 443 Term = TermRead. 444 445move_front(Options0, Opt, Options) :- 446 selectchk(Opt, Options0, Options1), 447 !, 448 Options = [Opt|Options1]. 449move_front(Options, _, Options). 450 451 452parse(Parser, M:Options, Document, In) :- 453 set_parser_options(Options, Parser, In, Options1), 454 parser_meta_options(Options1, M, Options2), 455 set_input_location(Parser, In), 456 sgml_parse(Parser, 457 [ document(Document), 458 source(In) 459 | Options2 460 ]). 461 462set_parser_options([], _, _, []). 463set_parser_options([H|T], Parser, In, Rest) :- 464 ( set_parser_option(H, Parser, In) 465 -> set_parser_options(T, Parser, In, Rest) 466 ; Rest = [H|R2], 467 set_parser_options(T, Parser, In, R2) 468 ). 469 470set_parser_option(Var, _Parser, _In) :- 471 var(Var), 472 !, 473 instantiation_error(Var). 474set_parser_option(Option, Parser, _) :- 475 def_entity(Option, Parser), 476 !. 477set_parser_option(offset(Offset), _Parser, In) :- 478 !, 479 seek(In, Offset, bof, _). 480set_parser_option(Option, Parser, _In) :- 481 parser_option(Option), 482 !, 483 set_sgml_parser(Parser, Option). 484set_parser_option(Name=Value, Parser, In) :- 485 Option =.. [Name,Value], 486 set_parser_option(Option, Parser, In). 487 488 489parser_option(dialect(_)). 490parser_option(shorttag(_)). 491parser_option(case_sensitive_attributes(_)). 492parser_option(case_preserving_attributes(_)). 493parser_option(system_entities(_)). 494parser_option(max_memory(_)). 495parser_option(file(_)). 496parser_option(line(_)). 497parser_option(space(_)). 498parser_option(number(_)). 499parser_option(defaults(_)). 500parser_option(doctype(_)). 501parser_option(qualify_attributes(_)). 502parser_option(encoding(_)). 503parser_option(keep_prefix(_)). 504 505 506def_entity(entity(Name, Value), Parser) :- 507 get_sgml_parser(Parser, dtd(DTD)), 508 xml_quote_attribute(Value, QValue), 509 setup_call_cleanup(open_dtd(DTD, [], Stream), 510 format(Stream, '<!ENTITY ~w "~w">~n', 511 [Name, QValue]), 512 close(Stream)). 513def_entity(xmlns(URI), Parser) :- 514 set_sgml_parser(Parser, xmlns(URI)). 515def_entity(xmlns(NS, URI), Parser) :- 516 set_sgml_parser(Parser, xmlns(NS, URI)).
522parser_meta_options([], _, []). 523parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 524 !, 525 parser_meta_options(T0, M, T). 526parser_meta_options([H|T0], M, [H|T]) :- 527 parser_meta_options(T0, M, T).
534set_input_location(Parser, _In) :- 535 get_sgml_parser(Parser, file(_)), 536 !. 537set_input_location(Parser, In) :- 538 stream_property(In, file_name(File)), 539 !, 540 set_sgml_parser(Parser, file(File)), 541 stream_property(In, position(Pos)), 542 set_sgml_parser(Parser, position(Pos)). 543set_input_location(_, _). 544 545 /******************************* 546 * UTILITIES * 547 *******************************/
556load_sgml_file(File, Term) :-
557 load_sgml(File, Term, []).
566load_xml_file(File, Term) :-
567 load_xml(File, Term, []).
576load_html_file(File, DOM) :-
577 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open) to support loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
606load_html(File, Term, M:Options) :-
607 current_prolog_flag(html_dialect, Dialect),
608 dtd(Dialect, DTD),
609 merge_options(Options,
610 [ dtd(DTD),
611 dialect(Dialect),
612 max_errors(-1),
613 syntax_errors(quiet)
614 ], Options1),
615 load_structure(File, Term, M:Options1).
dialect(xml)
625load_xml(Input, DOM, M:Options) :-
626 merge_options(Options,
627 [ dialect(xml)
628 ], Options1),
629 load_structure(Input, DOM, M:Options1).
dialect(sgml)
639load_sgml(Input, DOM, M:Options) :- 640 merge_options(Options, 641 [ dialect(sgml) 642 ], Options1), 643 load_structure(Input, DOM, M:Options1). 644 645 646 647 /******************************* 648 * ENCODING * 649 *******************************/
659xml_quote_attribute(In, Quoted) :- 660 xml_quote_attribute(In, Quoted, ascii). 661 662xml_quote_cdata(In, Quoted) :- 663 xml_quote_cdata(In, Quoted, ascii).
669xml_name(In) :- 670 xml_name(In, ascii). 671 672 673 /******************************* 674 * XML CHARACTER CLASSES * 675 *******************************/
690 /******************************* 691 * TYPE CHECKING * 692 *******************************/
699xml_is_dom(0) :- !, fail. % catch variables 700xml_is_dom(List) :- 701 is_list(List), 702 !, 703 xml_is_content_list(List). 704xml_is_dom(Term) :- 705 xml_is_element(Term). 706 707xml_is_content_list([]). 708xml_is_content_list([H|T]) :- 709 xml_is_content(H), 710 xml_is_content_list(T). 711 712xml_is_content(0) :- !, fail. 713xml_is_content(pi(Pi)) :- 714 !, 715 atom(Pi). 716xml_is_content(CDATA) :- 717 atom(CDATA), 718 !. 719xml_is_content(CDATA) :- 720 string(CDATA), 721 !. 722xml_is_content(Term) :- 723 xml_is_element(Term). 724 725xml_is_element(element(Name, Attributes, Content)) :- 726 dom_name(Name), 727 dom_attributes(Attributes), 728 xml_is_content_list(Content). 729 730dom_name(NS:Local) :- 731 atom(NS), 732 atom(Local), 733 !. 734dom_name(Local) :- 735 atom(Local). 736 737dom_attributes(0) :- !, fail. 738dom_attributes([]). 739dom_attributes([H|T]) :- 740 dom_attribute(H), 741 dom_attributes(T). 742 743dom_attribute(Name=Value) :- 744 dom_name(Name), 745 atomic(Value). 746 747 748 /******************************* 749 * MESSAGES * 750 *******************************/ 751:- multifile 752 prolog:message/3. 753 754% Catch messages. sgml/4 is generated by the SGML2PL binding. 755 756prologmessage(sgml(Parser, File, Line, Message)) --> 757 { get_sgml_parser(Parser, dialect(Dialect)) 758 }, 759 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 760 761 762 /******************************* 763 * XREF SUPPORT * 764 *******************************/ 765 766:- multifile 767 prolog:called_by/2. 768 769prologcalled_by(sgml_parse(_, Options), Called) :- 770 findall(Meta, meta_call_term(_, Meta, Options), Called). 771 772meta_call_term(T, G+N, Options) :- 773 T = call(Event, G), 774 pmember(T, Options), 775 call_params(Event, Term), 776 functor(Term, _, N). 777 778pmember(X, List) :- % member for partial lists 779 nonvar(List), 780 List = [H|T], 781 ( X = H 782 ; pmember(X, T) 783 ). 784 785call_params(begin, begin(tag,attributes,parser)). 786call_params(end, end(tag,parser)). 787call_params(cdata, cdata(cdata,parser)). 788call_params(pi, pi(cdata,parser)). 789call_params(decl, decl(cdata,parser)). 790call_params(error, error(severity,message,parser)). 791call_params(xmlns, xmlns(namespace,url,parser)). 792call_params(urlns, urlns(url,url,parser)). 793 794 /******************************* 795 * SANDBOX * 796 *******************************/ 797 798:- multifile 799 sandbox:safe_primitive/1, 800 sandbox:safe_meta_predicate/1. 801 802sandbox:safe_meta_predicate(sgml:load_structure/3). 803sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 804 dtd_alias(Dialect, _). 805sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 806sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 807sandbox:safe_primitive(sgml:xml_name(_,_)). 808sandbox:safe_primitive(sgml:xml_basechar(_)). 809sandbox:safe_primitive(sgml:xml_ideographic(_)). 810sandbox:safe_primitive(sgml:xml_combining_char(_)). 811sandbox:safe_primitive(sgml:xml_digit(_)). 812sandbox:safe_primitive(sgml:xml_extender(_)). 813sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 814sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 815sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by library(xpath) to extract information from the document.