-module(xml_tokenise).
%% -compile(export_all).
%% xml_tokenise:str2tags("
%%
%% ]>
%%
%% data
%% ") =>
%%[{tag,Ln,"!DOCTYPE ...",
%% {tag,Ln,"abc"},
%% {raw,Ln,"\ndata\n"},
%% {tag,Lb,"abc"}].
%%
%% str2tags breaks down the top level structure into a sequence of tagged
%% object.
%% tagbody2toks breaks down the "inner" structure of a tag
%%
%% xml_tokenise:tagbody2toks(Line, "!ENTITY % abc 123") =>
%% ["ENTITY", '%' "123"]
%%
%% xml_tokenise:tagbody2toks("!DOCTYPE abc [
%% ['!', "DOCTYPE", "abc", '[', {tag,"!ENTITY"}, ...]
-export([collect_entity/1, collect_name/1, file/1, stag/1, str2tags/1,
tagbody2toks/2, tupStr/1]).
-import(lists, [foreach/2, map/2, reverse/1, reverse/2]).
-import(xml, [fatal/1,fatal/2,warning/2]).
file(File) ->
{ok, B} = file:read_file(File ++ ".xml"),
str2tags(binary_to_list(B)).
test() ->
Str = test_data(),
test(Str).
test(Str) ->
case str2tags(Str) of
{ok, Tags} ->
io:format("str2tags(\"~s\") => ~p~n", [Str, Tags]),
foreach(fun({tag,Line,Str1}) ->
R = tagbody2toks(Line,Str1),
io:format("tagbody2tags(~w,\"~s\") => ~p~n",
[Line,Str1, R]);
(_) -> true
end, Tags);
Other ->
io:format("str2tags(\"~s\") => ~p~n", [Str, Other])
end.
test_data() ->
"
]>
data
".
str2tags(L) ->
case str2tags(L, new_env(), []) of
{{_,0}, Toks} ->
{ok, Toks};
{{_, N}, Toks} ->
errors
end.
str2tags([$<|T], Env, Toks) ->
%% io:format("getting a tag:~s~n", [[$<|T]]),
{Tag, T1, Env1} = collect_tag([$<|T], Env),
%% io:format("tag is:~p~nT1:|~s|~n", [Tag, T1]),
str2tags(T1, Env1, [Tag|Toks]);
str2tags([H|T], Env, Toks) ->
%% io:format("getting raw data:~s~n", [[H|T]]),
{Raw, T1, Env1} = collect_raw([H|T], Env, []),
%% io:format("raw is:~p~n:T1:~s~n", [Raw,T1]),
str2tags(T1, Env1, [{raw_tag(Raw), lineno(Env), Raw}|Toks]);
str2tags([], Env, Toks) ->
{Env, reverse(Toks)}.
raw_tag(Str) ->
case all_spaces(Str) of
true -> space;
false -> raw
end.
all_spaces([H|T]) ->
case is_Space(H) of
true -> all_spaces(T);
false -> false
end;
all_spaces([]) ->
true.
tagbody2toks(Line, T) ->
tagbody2toks(T, {Line, 0}, []).
tagbody2toks([$"|T], E, Toks) ->
{Str, T1, E1} = collect_literal($", lineno(E), T, E, []),
tagbody2toks(T1, E1, [{lit, lineno(E), Str}|Toks]);
tagbody2toks([$'|T], E, Toks) ->
{Str, T1, E1} = collect_literal($', lineno(E), T, E, []),
tagbody2toks(T1, E1, [{lit, lineno(E), Str}|Toks]);
tagbody2toks([$<|T], E, Toks) ->
{Tag, T1, Env1} = collect_tag([$<|T], E),
tagbody2toks(T1, Env1, [Tag|Toks]);
tagbody2toks([$>|T], E, Toks) ->
tagbody2toks(T, E, [$>|Toks]);
tagbody2toks([$ ,$-,$-|T], E, Toks) ->
{Str, T1, E1} = collect_old_style_comment(T, E, []),
warning(lineno(E), ["old style comment dropped " ++ Str]),
tagbody2toks(T1, E1, Toks);
tagbody2toks([$%|T], E, Toks) ->
case collect_entity([$%|T], E) of
fail ->
tagbody2toks(T, E, ['%'|Toks]);
{Entity, T1, E1} ->
tagbody2toks(T1, E1, [Entity|Toks])
end;
tagbody2toks([H|T], E, Toks) ->
case is_Space(H) of
true ->
{T1, E1} = skip_space([H|T], E),
tagbody2toks(T1, E1, Toks);
false ->
case is_Name(H) of
true ->
{Str, T1, E1} = collect_name([H|T], E),
tagbody2toks(T1, E1, [Str|Toks]);
false ->
tagbody2toks(T, E, [list_to_atom([H])|Toks])
end
end;
tagbody2toks([], E, Toks) ->
reverse(Toks).
collect_literal(Stop, StartLn, [Stop|T], E, L) ->
{reverse(L), T, E};
collect_literal(Stop, StartLn, [H|T], E, L) ->
collect_literal(Stop, StartLn, T, bump_line(H, E), [H|L]);
collect_literal(Stop, StartLn, [], E, L) ->
literal_error(StartLn, L, E).
literal_error(Line, Str, E) ->
context(reverse(Str)),
eof_error("Literal which started in line " ++ integer_to_list(Line),E).
context(Str) ->
io:format("** context ** [~s]~n", [lists:sublist(Str, 20)]).
collect_name(Str) ->
case collect_name(Str, {1,0}) of
{Name, T, {_,0}} ->
{Name, T};
_ ->
fail
end.
collect_name([H|T], E) ->
case is_Name(H) of
true ->
{Str, T1, E1} = collect_while(fun is_NameChar/1, T, E),
{[H|Str], T1, E1};
false ->
{[], [H|T], E}
end;
collect_name([], E) ->
{[], [], E}.
stag([$?|T]) ->
{Str, T1, _} = collect_name(T, {0,0}),
[$?|tupStr(Str)];
stag([$!|T]) ->
{Str, T1, _} = collect_name(T, {0,0}),
[$!|tupStr(Str)];
stag(T) ->
{Str, T1, _} = collect_name(T, {0,0}),
tupStr(Str).
%% Character predictes
%% [3] MiscName ::= '.' | '-' | '_' | ':' | CombiningChar | Ignorable |
%% Extender
%% [4] NameChar ::= Letter | Digit | MiscName
%% [5] Name ::= (Letter | '_' | ':') (NameChar)*
%% [6] Names ::= Name (S Name)*
%% [7] Nmtoken ::= (NameChar)+
%% [8] Nmtokens ::= Nmtoken (S Nmtoken)*
%% CombiningChar and Ignorable and Extender have been ignored
is_MiscName($.) -> true;
is_MiscName($-) -> true;
is_MiscName($_) -> true;
is_MiscName($:) -> true;
is_MiscName(_) -> false.
is_NameChar(X) ->
case is_Letter(X) of
true -> true;
false -> case is_Digit(X) of
true -> true;
false -> is_MiscName(X)
end
end.
is_Name($:) -> true;
is_Name($_) -> true;
is_Name(H) -> is_Letter(H).
is_Space($ ) -> true;
is_Space($\t) -> true;
is_Space($\n) -> true;
is_Space(_) -> false.
is_SimpleDataChar($<) -> false;
is_SimpleDataChar($&) -> false;
is_SimpleDataChar(_) -> true.
is_Letter(X) when $a =< X, X =< $z -> true;
is_Letter(X) when $A =< X, X =< $Z -> true;
is_Letter(_) -> false.
is_Digit(X) when $0 =< X, X =< $9 -> true;
is_Digit(_) -> false.
is_HexDigit(X) when X =< $0, X =< $9 -> true;
is_HexDigit(X) when X =< $a, X =< $z -> true;
is_HexDigit(X) when X =< $A, X =< $Z -> true;
is_HexDigit(_) -> false.
%% collect_tag(T, E) -> {Str, T', E'}
%% This collects all the stuff inside a tag as a single string
%% This is called at the top level
collect_tag([$<,$!,$-,$-|T], E) ->
{Str, T1, E1} = collect_comment(T, E, []),
{{comment, lineno(E), Str}, T1, E1};
collect_tag([$<,$!,$[,$C,$D,$A,$T,$A,$[|T], E) ->
{Str, T1, E1} = collect_cdata(T, E, []),
{{cdata,Str}, T1, E1};
collect_tag([$<|T], E) ->
{Str, T1, E1} = collect_tag_args(T, 1, E, []),
{{tag,lineno(E), Str}, T1, E1}.
collect_tag_args([$"|T], Level, E, L) ->
{Str, T1, E1} = collect_literal($", lineno(E), T, E, [$"]),
collect_tag_args(T1, Level, E1, [$"|reverse(Str, L)]);
collect_tag_args([$'|T], Level, E, L) ->
{Str, T1, E1} = collect_literal($', lineno(E), T, E, [$']),
collect_tag_args(T1, Level, E1, [$'|reverse(Str, L)]);
collect_tag_args([$<|T], Level, E, L) ->
collect_tag_args(T, Level+1, E, [$<|L]);
collect_tag_args([$>|T], 1, E, L) ->
{reverse(L), T, E};
collect_tag_args([$>|T], Level, E, L) ->
collect_tag_args(T, Level-1, E, [$>|L]);
collect_tag_args([$ ,$-,$-|T], Level, E, L) ->
{Str, T1, E1} = collect_old_style_comment(T, E, []),
warning(lineno(E), ["old style comment dropped " ++ Str]),
collect_tag_args(T1, Level, E1, L);
collect_tag_args([H|T], Level, E, L) ->
collect_tag_args(T, Level, bump_line(H, E), [H|L]);
collect_tag_args([], Level, E, L) ->
eof_error("in a tag", E).
collect_old_style_comment([$-,$-|T], E, L) ->
{reverse(L), T, E};
collect_old_style_comment([H|T], E, L) ->
collect_old_style_comment(T, bump_line(H, E), [H|L]);
collect_old_style_comment([], E, L) ->
eof_error("Old_Style_Comment", E).
collect_comment([$-,$-,$>|T], E, L) ->
{reverse(L), T, E};
collect_comment([$-,$-,H|T], E, L) ->
io:format("*** warning \"--\" in comment~n", []),
collect_comment(T, E, [$-,$-|L]);
collect_comment([H|T], E, L) ->
collect_comment(T, bump_line(H, E), [H|L]);
collect_comment([], E, L) ->
eof_error("Comment", E).
collect_cdata([$],$]|T], E, L) ->
{reverse(L), T, E};
collect_cdata([H|T], E, L) ->
collect_cdata(T, bump_line(H, E), [H|L]);
collect_cdata([], E, L) ->
eof_error("CDATA", E).
collect_raw([$<|T], E, L) ->
{reverse(L), [$<|T], E};
collect_raw([H|T], E, L) ->
collect_raw(T, bump_line(H, E), [H|L]);
collect_raw([], E, L) ->
{reverse(L), [], E}.
collect_string([$<|T], E, L) ->
{reverse(L), [$<|T], E};
collect_string([$&|T], E, L) ->
{reverse(L), [$&|T], E};
collect_string([H|T], E, L) ->
collect_string(T, bump_line(H, E), [H|L]);
collect_string([], E, L) ->
{reverse(L), [], E}.
collect_entity(Str) ->
case collect_entity(Str, new_env()) of
{Entity, Str1, _} ->
{Entity, Str1};
fail ->
fail
end.
collect_entity([$%|T], E) ->
case collect_name(T, E) of
{Str, [$;|T1], E1} ->
{{pent,Str}, T1, E1};
_ ->
fail
end;
collect_entity([$&,$#|T], E) ->
case collect_while(fun is_Digit/1, T, E) of
{Str, [$;|T1], E1} ->
{{charent, list_to_integer(Str)}, T1, E1};
_ ->
fail
end;
collect_entity([$&|T], E) ->
case collect_name(T, E) of
{Str, [$;|T1], E1} ->
{{gent, Str}, T1, E1};
_ ->
fail
end;
collect_entity(T, E) ->
fail.
eof_error(Where, E) ->
fatal(lineno(E), ["*** Unexpected EOF in ", Where]),
{"", [], bump_errors(E)}.
new_env() -> {1,0}.
bump_line($\n, {N,M}) -> {N+1, M};
bump_line(_, X) -> X.
bump_errors({N,M}) -> {N, M+1}.
lineno({N,M}) -> N.
skip_space([], E) -> {[], E};
skip_space([H|T], E) ->
case is_Space(H) of
true -> skip_space(T, bump_line(H, E));
false -> {[H|T], E}
end.
+type collect_while((X) -> bool(), [X], env()) -> {[X], [X], env()}.
collect_while(Fun, T, E) -> collect_while(Fun, T, [], E).
collect_while(Fun, [], L, E) ->
{reverse(L), [], E};
collect_while(Fun, [H|T], L, E) ->
case Fun(H) of
true ->
collect_while(Fun, T, [H|L], bump_line(H, E));
false ->
{reverse(L), [H|T], E}
end.
tupStr(Str) when list(Str) -> map(fun tupChar/1, Str);
tupStr(Other) -> fatal(["tupStr bad arg",Other]),"error".
tupChar(X) when $a =< X, X =< $z -> X - $a + $A;
tupChar(X) -> X.