View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2018, VU University Amsterdam
    7                              CWI, Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(csv,
   37          [ csv//1,                     % +Rows
   38            csv//2,                     % +Rows, +Options
   39
   40            csv_read_file/2,            % +File, -Data
   41            csv_read_file/3,            % +File, -Data, +Options
   42            csv_read_stream/3,          % +Stream, -Data, +Options
   43
   44            csv_read_file_row/3,        % +File, -Row, +Options
   45            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   46            csv_options/2,		% -Compiled, +Options
   47
   48            csv_write_file/2,           % +File, +Data
   49            csv_write_file/3,           % +File, +Data, +Options
   50            csv_write_stream/3          % +Stream, +Data, +Options
   51          ]).   52:- use_module(library(record),[(record)/1, op(_,_,record)]).   53
   54:- autoload(library(apply),[maplist/2]).   55:- autoload(library(debug),[debug/3]).   56:- autoload(library(error),[must_be/2,domain_error/2]).   57:- autoload(library(lists),[append/3]).   58:- autoload(library(option),[option/2,select_option/4]).   59:- autoload(library(pure_input),
   60	    [phrase_from_file/3,phrase_from_stream/2]).   61:- autoload(library(readutil),[read_line_to_codes/2]).   62:- autoload(library(dcg/basics),[string//1,eos//0]).

Process CSV (Comma-Separated Values) data

This library parses and generates CSV data. CSV data is represented in Prolog as a list of rows. Each row is a compound term, where all rows have the same name and arity.

See also: - RFC 4180 */
To be done: - Implement immediate assert of the data to avoid possible stack overflows.; - Writing creates an intermediate code-list, possibly overflowing resources. This waits for pure output!

   78:- predicate_options(csv//2, 2,
   79                     [ separator(nonneg),       % mustv be code
   80                       strip(boolean),
   81                       ignore_quotes(boolean),
   82                       convert(boolean),
   83                       case(oneof([down,preserve,up])),
   84                       functor(atom),
   85                       arity(-nonneg),          % actually ?nonneg
   86                       match_arity(boolean)
   87                     ]).   88:- predicate_options(csv_read_file/3, 3,
   89                     [ pass_to(csv//2, 2),
   90                       pass_to(phrase_from_file/3, 3)
   91                     ]).   92:- predicate_options(csv_read_file_row/3, 3,
   93                     [ pass_to(csv//2, 2),
   94                       pass_to(open/4, 4)
   95                     ]).   96:- predicate_options(csv_write_file/3, 3,
   97                     [ pass_to(csv//2, 2),
   98                       pass_to(open/4, 4)
   99                     ]).  100:- predicate_options(csv_write_stream/3, 3,
  101                     [ pass_to(csv//2, 2)
  102                     ]).  103
  104
  105:- record
  106    csv_options(separator:integer=0',,
  107                strip:boolean=false,
  108                ignore_quotes:boolean=false,
  109                convert:boolean=true,
  110                case:oneof([down,preserve,up])=preserve,
  111                functor:atom=row,
  112                arity:integer,
  113                match_arity:boolean=true,
  114                skip_header:atom).

csv_read_file(+File, -Rows) is det

csv_read_file(+File, -Rows, +Options) is det

Read a CSV file into a list of rows. Each row is a Prolog term with the same arity. Options is handed to csv//2. Remaining options are processed by phrase_from_file/3. The default separator depends on the file name extension and is \t for .tsv files and , otherwise.

Suppose we want to create a predicate table/6 from a CSV file that we know contains 6 fields per record. This can be done using the code below. Without the option arity(6), this would generate a predicate table/N, where N is the number of fields per record in the data.

?- csv_read_file(File, Rows, [functor(table), arity(6)]),
   maplist(assert, Rows).

  138csv_read_file(File, Rows) :-
  139    csv_read_file(File, Rows, []).
  140
  141csv_read_file(File, Rows, Options) :-
  142    default_separator(File, Options, Options1),
  143    make_csv_options(Options1, Record, RestOptions),
  144    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  145
  146
  147default_separator(File, Options0, Options) :-
  148    (   option(separator(_), Options0)
  149    ->  Options = Options0
  150    ;   file_name_extension(_, Ext0, File),
  151        downcase_atom(Ext0, Ext),
  152        ext_separator(Ext, Sep)
  153    ->  Options = [separator(Sep)|Options0]
  154    ;   Options = Options0
  155    ).
  156
  157ext_separator(csv, 0',).
  158ext_separator(tsv, 0'\t).

csv_read_stream(+Stream, -Rows, +Options) is det: Read CSV data from Stream. See also csv_read_row/3.

  165csv_read_stream(Stream, Rows, Options) :-
  166    make_csv_options(Options, Record, _),
  167    phrase_from_stream(csv_roptions(Rows, Record), Stream).

csv(?Rows)// is det

csv(?Rows, +Options)// is det

Prolog DCG to `read/write' CSV data. Options:

separator(+Code): The comma-separator. Must be a character code. Default is (of course) the comma. Character codes can be specified using the 0' notion. E.g., using separator(0';) parses a semicolon separated file.
ignore_quotes(+Boolean): If true (default false), threat double quotes as a normal character.
strip(+Boolean): If true (default false), strip leading and trailing blank space. RFC4180 says that blank space is part of the data.
skip_header(+CommentLead): Skip leading lines that start with CommentLead. There is no standard for comments in CSV files, but some CSV files have a header where each line starts with #. After skipping comment lines this option causes csv//2 to skip empty lines. Note that an empty line may not contain white space characters (space or tab) as these may provide valid data.
convert(+Boolean): If true (default), use name/2 on the field data. This translates the field into a number if possible.
case(+Action): If down, downcase atomic values. If up, upcase them and if preserve (default), do not change the case.
functor(+Atom): Functor to use for creating row terms. Default is row.
arity(?Arity): Number of fields in each row. This predicate raises a domain_error(row_arity(Expected), Found) if a row is found with different arity.
match_arity(+Boolean): If false (default true), do not reject CSV files where lines provide a varying number of fields (columns). This can be a work-around to use some incorrect CSV files.

  219csv(Rows) -->
  220    csv(Rows, []).
  221
  222csv(Rows, Options) -->
  223    { make_csv_options(Options, Record, _) },
  224    csv_roptions(Rows, Record).
  225
  226csv_roptions(Rows, Record) -->
  227    { ground(Rows) },
  228    !,
  229    emit_csv(Rows, Record).
  230csv_roptions(Rows, Record) -->
  231    skip_header(Record),
  232    csv_data(Rows, Record).
  233
  234skip_header(Options) -->
  235    { csv_options_skip_header(Options, CommentStart),
  236      nonvar(CommentStart),
  237      atom_codes(CommentStart, Codes)
  238    },
  239    !,
  240    skip_header_lines(Codes),
  241    skip_blank_lines.
  242skip_header(_) -->
  243    [].
  244
  245skip_header_lines(CommentStart) -->
  246    string(CommentStart),
  247    !,
  248    (   string(_Comment),
  249        end_of_record
  250    ->  skip_header_lines(CommentStart)
  251    ).
  252skip_header_lines(_) -->
  253    [].
  254
  255skip_blank_lines -->
  256    eos,
  257    !.
  258skip_blank_lines -->
  259    end_of_record,
  260    !,
  261    skip_blank_lines.
  262skip_blank_lines -->
  263    [].
  264
  265csv_data([], _) -->
  266    eos,
  267    !.
  268csv_data([Row|More], Options) -->
  269    row(Row, Options),
  270    !,
  271    { debug(csv, 'Row: ~p', [Row]) },
  272    csv_data(More, Options).
  273
  274
  275row(Row, Options) -->
  276    fields(Fields, Options),
  277    { csv_options_functor(Options, Functor),
  278      Row =.. [Functor|Fields],
  279      functor(Row, _, Arity),
  280      check_arity(Options, Arity)
  281    }.
  282
  283check_arity(Options, Arity) :-
  284    csv_options_arity(Options, Arity),
  285    !.
  286check_arity(Options, _) :-
  287    csv_options_match_arity(Options, false),
  288    !.
  289check_arity(Options, Arity) :-
  290    csv_options_arity(Options, Expected),
  291    domain_error(row_arity(Expected), Arity).
  292
  293fields([F|T], Options) -->
  294    field(F, Options),
  295    (   separator(Options)
  296    ->  fields(T, Options)
  297    ;   end_of_record
  298    ->  { T = [] }
  299    ).
  300
  301field(Value, Options) -->
  302    "\"",
  303    { csv_options_ignore_quotes(Options, false) },
  304    !,
  305    string_codes(Codes),
  306    { make_value(Codes, Value, Options) }.
  307field(Value, Options) -->
  308    { csv_options_strip(Options, true) },
  309    !,
  310    stripped_field(Value, Options).
  311field(Value, Options) -->
  312    { csv_options_separator(Options, Sep) },
  313    field_codes(Codes, Sep),
  314    { make_value(Codes, Value, Options) }.
  315
  316
  317stripped_field(Value, Options) -->
  318    ws,
  319    (   "\"",
  320        { csv_options_strip(Options, false) }
  321    ->  string_codes(Codes),
  322        ws
  323    ;   { csv_options_separator(Options, Sep) },
  324        field_codes(Codes0, Sep),
  325        { strip_trailing_ws(Codes0, Codes) }
  326    ),
  327    { make_value(Codes, Value, Options) }.
  328
  329ws --> " ", !, ws.
  330ws --> "\t", !, ws.
  331ws --> "".
  332
  333strip_trailing_ws(List, Stripped) :-
  334    append(Stripped, WS, List),
  335    all_ws(WS).
  336
  337all_ws([]).
  338all_ws([32|T]) :- all_ws(T).
  339all_ws([9|T]) :- all_ws(T).

string_codes(-Codes): Process a double-quotes string where the quote is escaped by doubling it. Eats the terminating double-quote.

  347string_codes(List) -->
  348    [H],
  349    (   { H == 0'" }
  350    ->  (   "\""
  351        ->  { List = [H|T] },
  352            string_codes(T)
  353        ;   { List = [] }
  354        )
  355    ;   { List = [H|T] },
  356        string_codes(T)
  357    ).
  358
  359field_codes([], Sep), [Sep] --> [Sep], !.
  360field_codes([], _), "\n" --> "\r\n", !.
  361field_codes([], _), "\n" --> "\n", !.
  362field_codes([], _), "\n" --> "\r", !.
  363field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  364field_codes([], _) --> [].              % unterminated last record

make_value(+Codes, -Value, +Options) is det: Convert a list of character codes to the actual value, depending on Options.

  371make_value(Codes, Value, Options) :-
  372    csv_options_convert(Options, Convert),
  373    csv_options_case(Options, Case),
  374    make_value(Convert, Case, Codes, Value).
  375
  376make_value(true, preserve, Codes, Value) :-
  377    !,
  378    name(Value, Codes).
  379make_value(true, Case, Codes, Value) :-
  380    !,
  381    (   number_string(Value, Codes)
  382    ->  true
  383    ;   make_value(false, Case, Codes, Value)
  384    ).
  385make_value(false, preserve, Codes, Value) :-
  386    !,
  387    atom_codes(Value, Codes).
  388make_value(false, down, Codes, Value) :-
  389    !,
  390    string_codes(String, Codes),
  391    downcase_atom(String, Value).
  392make_value(false, up, Codes, Value) :-
  393    string_codes(String, Codes),
  394    upcase_atom(String, Value).
  395
  396separator(Options) -->
  397    { csv_options_separator(Options, Sep) },
  398    [Sep].
  399
  400end_of_record --> "\n".			% Unix files
  401end_of_record --> "\r\n".               % DOS files
  402end_of_record --> "\r".                 % MacOS files
  403end_of_record --> eos.                  % unterminated last record

csv_read_file_row(+File, -Row, +Options) is nondet

True when Row is a row in File. First unifies Row with the first row in File. Backtracking yields the second, ... row. This interface is an alternative to csv_read_file/3 that avoids loading all rows in memory. Note that this interface does not guarantee that all rows in File have the same arity.

In addition to the options of csv_read_file/3, this predicate processes the option:

line(-Line): Line is unified with the 1-based line-number from which Row is read. Note that Line is not the physical line, but rather the logical record number.

  422csv_read_file_row(File, Row, Options) :-
  423    default_separator(File, Options, Options1),
  424    make_csv_options(Options1, RecordOptions, Options2),
  425    select_option(line(Line), Options2, RestOptions, _),
  426    setup_call_cleanup(
  427        open(File, read, Stream, RestOptions),
  428        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  429        close(Stream)).
  430
  431csv_read_stream_row(Stream, Row, Line, Options) :-
  432    between(1, infinite, Line),
  433    (   csv_read_row(Stream, Row0, Options),
  434        Row0 \== end_of_file
  435    ->  Row = Row0
  436    ;   !,
  437        fail
  438    ).

csv_read_row(+Stream, -Row, +CompiledOptions) is det: Read the next CSV record from Stream and unify the result with Row. CompiledOptions is created from options defined for csv//2 using csv_options/2. Row is unified with end_of_file upon reaching the end of the input.

  448csv_read_row(Stream, Row, _Record) :-
  449    at_end_of_stream(Stream),
  450    !,
  451    Row = end_of_file.
  452csv_read_row(Stream, Row, Record) :-
  453    read_lines_to_codes(Stream, Codes, Record, even),
  454    phrase(row(Row0, Record), Codes),
  455    !,
  456    Row = Row0.
  457
  458read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :-
  459    read_line_to_codes(Stream, Codes0),
  460    Codes0 \== end_of_file,
  461    (   (   csv_options_ignore_quotes(Options, true)
  462        ;   check_quotes(Codes0, QuoteQuantity, even)
  463        )
  464    ->  Codes = Codes0
  465    ;   append(Codes0, [0'\n|Tail], Codes),
  466        read_lines_to_codes(Stream, Tail, Options, odd)
  467    ).
  468
  469check_quotes([], QuoteQuantity, QuoteQuantity) :-
  470    !.
  471check_quotes([0'"|T], odd, Result) :-
  472    !,
  473    check_quotes(T, even, Result).
  474check_quotes([0'"|T], even, Result) :-
  475    !,
  476    check_quotes(T, odd, Result).
  477check_quotes([_|T], QuoteQuantity, Result) :-
  478    check_quotes(T, QuoteQuantity, Result).

csv_options(-Compiled, +Options) is det: Compiled is the compiled representation of the CSV processing options as they may be passed into csv//2, etc. This predicate is used in combination with csv_read_row/3 to avoid repeated processing of the options.

  488csv_options(Compiled, Options) :-
  489    make_csv_options(Options, Compiled, _Ignored).
  490
  491
  492                /*******************************
  493                *             OUTPUT           *
  494                *******************************/

csv_write_file(+File, +Data) is det
csv_write_file(+File, +Data, +Options) is det: Write a list of Prolog terms to a CSV file. Options are given to csv//2. Remaining options are given to open/4. The default separator depends on the file name extension and is \t for .tsv files and , otherwise.

  504csv_write_file(File, Data) :-
  505    csv_write_file(File, Data, []).
  506
  507csv_write_file(File, Data, Options) :-
  508    must_be(list, Data),
  509    default_separator(File, Options, Options1),
  510    make_csv_options(Options1, OptionsRecord, RestOptions),
  511    setup_call_cleanup(
  512        open(File, write, Out, RestOptions),
  513        maplist(csv_write_row(Out, OptionsRecord), Data),
  514        close(Out)).
  515
  516csv_write_row(Out, OptionsRecord, Row) :-
  517    phrase(emit_row(Row, OptionsRecord), String),
  518    format(Out, '~s', [String]).
  519
  520emit_csv([], _) --> [].
  521emit_csv([H|T], Options) -->
  522    emit_row(H, Options),
  523    emit_csv(T, Options).
  524
  525emit_row(Row, Options) -->
  526    { Row =.. [_|Fields] },
  527    emit_fields(Fields, Options),
  528    "\r\n".                                     % RFC 4180 demands \r\n
  529
  530emit_fields([], _) -->
  531    "".
  532emit_fields([H|T], Options) -->
  533    emit_field(H, Options),
  534    (   { T == [] }
  535        ->  []
  536        ;   { csv_options_separator(Options, Sep) },
  537        [Sep],
  538        emit_fields(T, Options)
  539    ).
  540
  541emit_field(H, Options) -->
  542    { (   atom(H)
  543      ->  atom_codes(H, Codes)
  544      ;   string(H)
  545      ->  string_codes(H, Codes)
  546      )
  547    },
  548    !,
  549    (   { needs_quotes(H, Options) }
  550    ->  "\"", emit_string(Codes), "\""
  551    ;   emit_codes(Codes)
  552    ).
  553emit_field([], _) -->
  554    !,
  555    { atom_codes('[]', Codes) },
  556    emit_codes(Codes).
  557emit_field(H, _) -->
  558    { number_codes(H,Codes) },
  559    emit_codes(Codes).
  560
  561needs_quotes(Atom, _) :-
  562    sub_atom(Atom, _, _, _, '"'),
  563    !.
  564needs_quotes(Atom, _) :-
  565    sub_atom(Atom, _, _, _, '\n'),
  566    !.
  567needs_quotes(Atom, _) :-
  568    sub_atom(Atom, _, _, _, '\r'),
  569    !.
  570needs_quotes(Atom, Options) :-
  571    csv_options_separator(Options, Sep),
  572    char_code(Char, Sep),
  573    sub_atom(Atom, _, _, _, Char),
  574    !.
  575
  576emit_string([]) --> "".
  577emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  578emit_string([H|T]) --> [H], emit_string(T).
  579
  580emit_codes([]) --> "".
  581emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  582emit_codes([H|T]) --> [H], emit_codes(T).

csv_write_stream(+Stream, +Data, +Options) is det

Write the rows in Data to Stream. This is similar to csv_write_file/3, but can deal with data that is produced incrementally. The example below saves all answers from the predicate data/3 to File.

save_data(File) :-
   setup_call_cleanup(
       open(File, write, Out),
       forall(data(C1,C2,C3),
              csv_write_stream(Out, [row(C1,C2,C3)], [])),
       close(Out)),

  601csv_write_stream(Stream, Data, Options) :-
  602    must_be(list, Data),
  603    make_csv_options(Options, OptionsRecord, _),
  604    maplist(csv_write_row(Stream, OptionsRecord), Data)