36
37:- module(sgml,
38 [ load_html/3, 39 load_xml/3, 40 load_sgml/3, 41
42 load_sgml_file/2, 43 load_xml_file/2, 44 load_html_file/2, 45
46 load_structure/3, 47
48 load_dtd/2, 49 load_dtd/3, 50 dtd/2, 51 dtd_property/2, 52
53 new_dtd/2, 54 free_dtd/1, 55 open_dtd/3, 56
57 new_sgml_parser/2, 58 free_sgml_parser/1, 59 set_sgml_parser/2, 60 get_sgml_parser/2, 61 sgml_parse/2, 62
63 sgml_register_catalog_file/2, 64
65 xml_quote_attribute/3, 66 xml_quote_cdata/3, 67 xml_quote_attribute/2, 68 xml_quote_cdata/2, 69 xml_name/1, 70 xml_name/2, 71
72 xsd_number_string/2, 73 xsd_time_string/3, 74
75 xml_basechar/1, 76 xml_ideographic/1, 77 xml_combining_char/1, 78 xml_digit/1, 79 xml_extender/1, 80
81 iri_xml_namespace/2, 82 iri_xml_namespace/3, 83 xml_is_dom/1 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89
90:- meta_predicate
91 load_structure(+, -, :),
92 load_html(+, -, :),
93 load_xml(+, -, :),
94 load_sgml(+, -, :). 95
96:- predicate_options(load_structure/3, 3,
97 [ charpos(integer),
98 cdata(oneof([atom,string])),
99 defaults(boolean),
100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
101 doctype(atom),
102 dtd(any),
103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
104 entity(atom,atom),
105 keep_prefix(boolean),
106 file(atom),
107 line(integer),
108 offset(integer),
109 number(oneof([token,integer])),
110 qualify_attributes(boolean),
111 shorttag(boolean),
112 case_sensitive_attributes(boolean),
113 case_preserving_attributes(boolean),
114 system_entities(boolean),
115 max_memory(integer),
116 space(oneof([sgml,preserve,default,remove,strict])),
117 xmlns(atom),
118 xmlns(atom,atom),
119 pass_to(sgml_parse/2, 2)
120 ]). 121:- predicate_options(load_html/3, 3,
122 [ pass_to(load_structure/3, 3)
123 ]). 124:- predicate_options(load_xml/3, 3,
125 [ pass_to(load_structure/3, 3)
126 ]). 127:- predicate_options(load_sgml/3, 3,
128 [ pass_to(load_structure/3, 3)
129 ]). 130:- predicate_options(load_dtd/3, 3,
131 [ dialect(oneof([sgml,xml,xmlns])),
132 pass_to(open/4, 4)
133 ]). 134:- predicate_options(sgml_parse/2, 2,
135 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
136 callable),
137 cdata(oneof([atom,string])),
138 content_length(integer),
139 document(-any),
140 max_errors(integer),
141 parse(oneof([file,element,content,declaration,input])),
142 source(any),
143 syntax_errors(oneof([quiet,print,style])),
144 xml_no_ns(oneof([error,quiet]))
145 ]). 146:- predicate_options(new_sgml_parser/2, 2,
147 [ dtd(any)
148 ]). 149
150
177
178:- multifile user:file_search_path/2. 179:- dynamic user:file_search_path/2. 180
181user:file_search_path(dtd, '.').
182user:file_search_path(dtd, swi('library/DTD')).
183
184sgml_register_catalog_file(File, Location) :-
185 prolog_to_os_filename(File, OsFile),
186 '_sgml_register_catalog_file'(OsFile, Location).
187
188:- use_foreign_library(foreign(sgml2pl)). 189
190register_catalog(Base) :-
191 absolute_file_name(dtd(Base),
192 [ extensions([soc]),
193 access(read),
194 file_errors(fail)
195 ],
196 SocFile),
197 sgml_register_catalog_file(SocFile, end).
198
199:- initialization
200 ignore(register_catalog('HTML4')). 201
202
203 206
213
214:- thread_local
215 current_dtd/2. 216:- volatile
217 current_dtd/2. 218:- thread_local
219 registered_cleanup/0. 220:- volatile
221 registered_cleanup/0. 222
223:- multifile
224 dtd_alias/2. 225
226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227
228dtd_alias(html4, 'HTML4').
229dtd_alias(html5, 'HTML5').
230dtd_alias(html, DTD) :-
231 current_prolog_flag(html_dialect, Dialect),
232 dtd_alias(Dialect, DTD).
233
243
244dtd(Type, DTD) :-
245 current_dtd(Type, DTD),
246 !.
247dtd(Type, DTD) :-
248 new_dtd(Type, DTD),
249 ( dtd_alias(Type, Base)
250 -> true
251 ; Base = Type
252 ),
253 absolute_file_name(dtd(Base),
254 [ extensions([dtd]),
255 access(read)
256 ], DtdFile),
257 load_dtd(DTD, DtdFile),
258 register_cleanup,
259 asserta(current_dtd(Type, DTD)).
260
273
274load_dtd(DTD, DtdFile) :-
275 load_dtd(DTD, DtdFile, []).
276load_dtd(DTD, DtdFile, Options) :-
277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
278 setup_call_cleanup(
279 open_dtd(DTD, DTDOptions, DtdOut),
280 setup_call_cleanup(
281 open(DtdFile, read, DtdIn, OpenOptions),
282 copy_stream_data(DtdIn, DtdOut),
283 close(DtdIn)),
284 close(DtdOut)).
285
290
291:- public
292 destroy_dtds/0. 293
294destroy_dtds :-
295 ( current_dtd(_Type, DTD),
296 free_dtd(DTD),
297 fail
298 ; true
299 ).
300
304
305register_cleanup :-
306 registered_cleanup,
307 !.
308register_cleanup :-
309 ( current_prolog_flag(threads, true)
310 -> prolog_listen(this_thread_exit, destroy_dtds)
311 ; true
312 ),
313 assert(registered_cleanup).
314
315
316 319
320prop(doctype(_), _).
321prop(elements(_), _).
322prop(entities(_), _).
323prop(notations(_), _).
324prop(entity(E, _), DTD) :-
325 ( nonvar(E)
326 -> true
327 ; '$dtd_property'(DTD, entities(EL)),
328 member(E, EL)
329 ).
330prop(element(E, _, _), DTD) :-
331 ( nonvar(E)
332 -> true
333 ; '$dtd_property'(DTD, elements(EL)),
334 member(E, EL)
335 ).
336prop(attributes(E, _), DTD) :-
337 ( nonvar(E)
338 -> true
339 ; '$dtd_property'(DTD, elements(EL)),
340 member(E, EL)
341 ).
342prop(attribute(E, A, _, _), DTD) :-
343 ( nonvar(E)
344 -> true
345 ; '$dtd_property'(DTD, elements(EL)),
346 member(E, EL)
347 ),
348 ( nonvar(A)
349 -> true
350 ; '$dtd_property'(DTD, attributes(E, AL)),
351 member(A, AL)
352 ).
353prop(notation(N, _), DTD) :-
354 ( nonvar(N)
355 -> true
356 ; '$dtd_property'(DTD, notations(NL)),
357 member(N, NL)
358 ).
359
360dtd_property(DTD, Prop) :-
361 prop(Prop, DTD),
362 '$dtd_property'(DTD, Prop).
363
364
365 368
390
391load_structure(Spec, DOM, Options) :-
392 sgml_open_options(Options, OpenOptions, SGMLOptions),
393 setup_call_cleanup(
394 open_any(Spec, read, In, Close, OpenOptions),
395 load_structure_from_stream(In, DOM, SGMLOptions),
396 close_any(Close)).
397
398sgml_open_options(Options, OpenOptions, SGMLOptions) :-
399 Options = M:Plain,
400 ( select_option(encoding(Encoding), Plain, NoEnc)
401 -> ( sgml_encoding(Encoding)
402 -> merge_options(NoEnc, [type(binary)], OpenOptions),
403 SGMLOptions = Options
404 ; OpenOptions = Plain,
405 SGMLOptions = M:NoEnc
406 )
407 ; merge_options(Plain, [type(binary)], OpenOptions),
408 SGMLOptions = Options
409 ).
410
411sgml_encoding(Enc) :-
412 downcase_atom(Enc, Enc1),
413 sgml_encoding_l(Enc1).
414
415sgml_encoding_l('iso-8859-1').
416sgml_encoding_l('us-ascii').
417sgml_encoding_l('utf-8').
418sgml_encoding_l('utf8').
419sgml_encoding_l('iso_latin_1').
420sgml_encoding_l('ascii').
421
422load_structure_from_stream(In, Term, M:Options) :-
423 ( select_option(dtd(DTD), Options, Options1)
424 -> ExplicitDTD = true
425 ; ExplicitDTD = false,
426 Options1 = Options
427 ),
428 move_front(Options1, dialect(_), Options2), 429 setup_call_cleanup(
430 new_sgml_parser(Parser,
431 [ dtd(DTD)
432 ]),
433 parse(Parser, M:Options2, TermRead, In),
434 free_sgml_parser(Parser)),
435 ( ExplicitDTD == true
436 -> ( DTD = dtd(_, DocType),
437 dtd_property(DTD, doctype(DocType))
438 -> true
439 ; true
440 )
441 ; free_dtd(DTD)
442 ),
443 Term = TermRead.
444
445move_front(Options0, Opt, Options) :-
446 selectchk(Opt, Options0, Options1),
447 !,
448 Options = [Opt|Options1].
449move_front(Options, _, Options).
450
451
452parse(Parser, M:Options, Document, In) :-
453 set_parser_options(Options, Parser, In, Options1),
454 parser_meta_options(Options1, M, Options2),
455 set_input_location(Parser, In),
456 sgml_parse(Parser,
457 [ document(Document),
458 source(In)
459 | Options2
460 ]).
461
462set_parser_options([], _, _, []).
463set_parser_options([H|T], Parser, In, Rest) :-
464 ( set_parser_option(H, Parser, In)
465 -> set_parser_options(T, Parser, In, Rest)
466 ; Rest = [H|R2],
467 set_parser_options(T, Parser, In, R2)
468 ).
469
470set_parser_option(Var, _Parser, _In) :-
471 var(Var),
472 !,
473 instantiation_error(Var).
474set_parser_option(Option, Parser, _) :-
475 def_entity(Option, Parser),
476 !.
477set_parser_option(offset(Offset), _Parser, In) :-
478 !,
479 seek(In, Offset, bof, _).
480set_parser_option(Option, Parser, _In) :-
481 parser_option(Option),
482 !,
483 set_sgml_parser(Parser, Option).
484set_parser_option(Name=Value, Parser, In) :-
485 Option =.. [Name,Value],
486 set_parser_option(Option, Parser, In).
487
488
489parser_option(dialect(_)).
490parser_option(shorttag(_)).
491parser_option(case_sensitive_attributes(_)).
492parser_option(case_preserving_attributes(_)).
493parser_option(system_entities(_)).
494parser_option(max_memory(_)).
495parser_option(file(_)).
496parser_option(line(_)).
497parser_option(space(_)).
498parser_option(number(_)).
499parser_option(defaults(_)).
500parser_option(doctype(_)).
501parser_option(qualify_attributes(_)).
502parser_option(encoding(_)).
503parser_option(keep_prefix(_)).
504
505
506def_entity(entity(Name, Value), Parser) :-
507 get_sgml_parser(Parser, dtd(DTD)),
508 xml_quote_attribute(Value, QValue),
509 setup_call_cleanup(open_dtd(DTD, [], Stream),
510 format(Stream, '<!ENTITY ~w "~w">~n',
511 [Name, QValue]),
512 close(Stream)).
513def_entity(xmlns(URI), Parser) :-
514 set_sgml_parser(Parser, xmlns(URI)).
515def_entity(xmlns(NS, URI), Parser) :-
516 set_sgml_parser(Parser, xmlns(NS, URI)).
517
521
522parser_meta_options([], _, []).
523parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
524 !,
525 parser_meta_options(T0, M, T).
526parser_meta_options([H|T0], M, [H|T]) :-
527 parser_meta_options(T0, M, T).
528
529
533
534set_input_location(Parser, _In) :-
535 get_sgml_parser(Parser, file(_)),
536 !.
537set_input_location(Parser, In) :-
538 stream_property(In, file_name(File)),
539 !,
540 set_sgml_parser(Parser, file(File)),
541 stream_property(In, position(Pos)),
542 set_sgml_parser(Parser, position(Pos)).
543set_input_location(_, _).
544
545 548
555
556load_sgml_file(File, Term) :-
557 load_sgml(File, Term, []).
558
565
566load_xml_file(File, Term) :-
567 load_xml(File, Term, []).
568
575
576load_html_file(File, DOM) :-
577 load_html(File, DOM, []).
578
605
606load_html(File, Term, M:Options) :-
607 current_prolog_flag(html_dialect, Dialect),
608 dtd(Dialect, DTD),
609 merge_options(Options,
610 [ dtd(DTD),
611 dialect(Dialect),
612 max_errors(-1),
613 syntax_errors(quiet)
614 ], Options1),
615 load_structure(File, Term, M:Options1).
616
624
625load_xml(Input, DOM, M:Options) :-
626 merge_options(Options,
627 [ dialect(xml)
628 ], Options1),
629 load_structure(Input, DOM, M:Options1).
630
638
639load_sgml(Input, DOM, M:Options) :-
640 merge_options(Options,
641 [ dialect(sgml)
642 ], Options1),
643 load_structure(Input, DOM, M:Options1).
644
645
646
647 650
658
659xml_quote_attribute(In, Quoted) :-
660 xml_quote_attribute(In, Quoted, ascii).
661
662xml_quote_cdata(In, Quoted) :-
663 xml_quote_cdata(In, Quoted, ascii).
664
668
669xml_name(In) :-
670 xml_name(In, ascii).
671
672
673 676
688
689
690 693
698
699xml_is_dom(0) :- !, fail. 700xml_is_dom(List) :-
701 is_list(List),
702 !,
703 xml_is_content_list(List).
704xml_is_dom(Term) :-
705 xml_is_element(Term).
706
707xml_is_content_list([]).
708xml_is_content_list([H|T]) :-
709 xml_is_content(H),
710 xml_is_content_list(T).
711
712xml_is_content(0) :- !, fail.
713xml_is_content(pi(Pi)) :-
714 !,
715 atom(Pi).
716xml_is_content(CDATA) :-
717 atom(CDATA),
718 !.
719xml_is_content(CDATA) :-
720 string(CDATA),
721 !.
722xml_is_content(Term) :-
723 xml_is_element(Term).
724
725xml_is_element(element(Name, Attributes, Content)) :-
726 dom_name(Name),
727 dom_attributes(Attributes),
728 xml_is_content_list(Content).
729
730dom_name(NS:Local) :-
731 atom(NS),
732 atom(Local),
733 !.
734dom_name(Local) :-
735 atom(Local).
736
737dom_attributes(0) :- !, fail.
738dom_attributes([]).
739dom_attributes([H|T]) :-
740 dom_attribute(H),
741 dom_attributes(T).
742
743dom_attribute(Name=Value) :-
744 dom_name(Name),
745 atomic(Value).
746
747
748 751:- multifile
752 prolog:message/3. 753
755
756prolog:message(sgml(Parser, File, Line, Message)) -->
757 { get_sgml_parser(Parser, dialect(Dialect))
758 },
759 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
760
761
762 765
766:- multifile
767 prolog:called_by/2. 768
769prolog:called_by(sgml_parse(_, Options), Called) :-
770 findall(Meta, meta_call_term(_, Meta, Options), Called).
771
772meta_call_term(T, G+N, Options) :-
773 T = call(Event, G),
774 pmember(T, Options),
775 call_params(Event, Term),
776 functor(Term, _, N).
777
778pmember(X, List) :- 779 nonvar(List),
780 List = [H|T],
781 ( X = H
782 ; pmember(X, T)
783 ).
784
785call_params(begin, begin(tag,attributes,parser)).
786call_params(end, end(tag,parser)).
787call_params(cdata, cdata(cdata,parser)).
788call_params(pi, pi(cdata,parser)).
789call_params(decl, decl(cdata,parser)).
790call_params(error, error(severity,message,parser)).
791call_params(xmlns, xmlns(namespace,url,parser)).
792call_params(urlns, urlns(url,url,parser)).
793
794 797
798:- multifile
799 sandbox:safe_primitive/1,
800 sandbox:safe_meta_predicate/1. 801
802sandbox:safe_meta_predicate(sgml:load_structure/3).
803sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
804 dtd_alias(Dialect, _).
805sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
806sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
807sandbox:safe_primitive(sgml:xml_name(_,_)).
808sandbox:safe_primitive(sgml:xml_basechar(_)).
809sandbox:safe_primitive(sgml:xml_ideographic(_)).
810sandbox:safe_primitive(sgml:xml_combining_char(_)).
811sandbox:safe_primitive(sgml:xml_digit(_)).
812sandbox:safe_primitive(sgml:xml_extender(_)).
813sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
814sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
815sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))