许久没做erlang开发了,最近有网友问到erlang的问题,就抽时间看下。问题是这样的,模块有中文,将中文直接打印出来,shell下显示会出现乱码,但如果先将中文转成binary,就可以正常显示出来。
shell中文乱码问题
这里以一个简单的例子,说明下:
-module(m). -compile(export_all). test() -> io:format("~ts~n", ["中国"]), io:format("~ts~n", [list_to_binary("中国")]).编译,然后测试下结果:
Eshell V5.10.3 (abort with ^G) 1> c(m). {ok, m} 2> m:test(). ??-??? 中国 ok
{function, test, 0, 2}. {label,1}. {line,[{location,"erl.erl",4}]}. {func_info,{atom,erl},{atom,test},0}. {label,2}. {allocate,0,0}. {move,{literal,[[228,184,173,229,155,189]]},{x,1}}. {move,{literal,"~ts~n"},{x,0}}. {line,[{location,"erl.erl",5}]}. {call_ext,2,{extfunc,io,format,2}}. {move,{literal,[<<228,184,173,229,155,189>>]},{x,1}}. {move,{literal,"~ts~n"},{x,0}}. {line,[{location,"erl.erl",6}]}. {call_ext_last,2,{extfunc,io,format,2},0}.实际上,erlang会做优化,list_to_binary在编译期就被优化掉了。
%% io_lib.erl format(Format, Args) -> case catch io_lib_format:fwrite(Format, Args) of {'EXIT',_} -> erlang:error(badarg, [Format, Args]); Other -> Other end.实现代码在 io_lib_format模块,如下:
%% io_lib_format.erl fwrite(Format, Args) when is_atom(Format) -> fwrite(atom_to_list(Format), Args); fwrite(Format, Args) when is_binary(Format) -> fwrite(binary_to_list(Format), Args); fwrite(Format, Args) -> Cs = collect(Format, Args), %% 收集格式化信息,生成控制结构 Pc = pcount(Cs), %% 计算请求打印的数量 build(Cs, Pc, 0). %% 解析控制结构,生成数据 collect([$~|Fmt0], Args0) -> %% 格式化参数以 ~打头,否则忽略 {C,Fmt1,Args1} = collect_cseq(Fmt0, Args0), [C|collect(Fmt1, Args1)]; collect([C|Fmt], Args) -> [C|collect(Fmt, Args)]; collect([], []) -> []. collect_cseq(Fmt0, Args0) -> {F,Ad,Fmt1,Args1} = field_width(Fmt0, Args0), {P,Fmt2,Args2} = precision(Fmt1, Args1), {Pad,Fmt3,Args3} = pad_char(Fmt2, Args2), {Encoding,Fmt4,Args4} = encoding(Fmt3, Args3), {Strings,Fmt5,Args5} = strings(Fmt4, Args4), {C,As,Fmt6,Args6} = collect_cc(Fmt5, Args5), {{C,As,F,Ad,P,Pad,Encoding,Strings},Fmt6,Args6}. %% 检查format 参数含有 t, 然后打标记 unicode,其他记latin1 encoding([$t|Fmt],Args) -> true = hd(Fmt) =/= $l, %% 确保不是传入 ~tl {unicode,Fmt,Args}; encoding(Fmt,Args) -> {latin1,Fmt,Args}.再看下以上build部分的代码。代码过长,做了删节:
%% io_lib_format.erl build([{C,As,F,Ad,P,Pad,Enc,Str}|Cs], Pc0, I) -> S = control(C, As, F, Ad, P, Pad, Enc, Str, I), %% 处理控制结构 Pc1 = decr_pc(C, Pc0), if Pc1 > 0 -> [S|build(Cs, Pc1, indentation(S, I))]; true -> [S|build(Cs, Pc1, I)] end; build([$\n|Cs], Pc, _I) -> [$\n|build(Cs, Pc, 0)]; build([$\t|Cs], Pc, I) -> [$\t|build(Cs, Pc, ((I + 8) div 8) * 8)]; build([C|Cs], Pc, I) -> [C|build(Cs, Pc, I+1)]; build([], _Pc, _I) -> []. control($w, [A], F, Adj, P, Pad, _Enc, _Str, _I) -> term(io_lib:write(A, -1), F, Adj, P, Pad); control($p, [A], F, Adj, P, Pad, Enc, Str, I) -> print(A, -1, F, Adj, P, Pad, Enc, Str, I); control($W, [A,Depth], F, Adj, P, Pad, _Enc, _Str, _I) when is_integer(Depth) -> term(io_lib:write(A, Depth), F, Adj, P, Pad); control($P, [A,Depth], F, Adj, P, Pad, Enc, Str, I) when is_integer(Depth) -> print(A, Depth, F, Adj, P, Pad, Enc, Str, I); control($s, [A], F, Adj, P, Pad, _Enc, _Str, _I) when is_atom(A) -> string(atom_to_list(A), F, Adj, P, Pad); control($s, [L0], F, Adj, P, Pad, latin1, _Str, _I) -> %% 处理 ~s,如果数据标记是 latin1 L = iolist_to_chars(L0), string(L, F, Adj, P, Pad); control($s, [L0], F, Adj, P, Pad, unicode, _Str, _I) -> %% 处理 ~s,如果数据标记是 unicode L = cdata_to_chars(L0), uniconv(string(L, F, Adj, P, Pad)); control($e, [A], F, Adj, P, Pad, _Enc, _Str, _I) when is_float(A) -> %% 该函数太长了,不是讨论重点,做了删节 cdata_to_chars([C|Cs]) when is_integer(C), C >= $\000 -> [C | cdata_to_chars(Cs)]; cdata_to_chars([I|Cs]) -> [cdata_to_chars(I) | cdata_to_chars(Cs)]; cdata_to_chars([]) -> []; cdata_to_chars(B) when is_binary(B) -> %% 如果数据是binary,做一下unicode转换 case catch unicode:characters_to_list(B) of L when is_list(L) -> L; _ -> binary_to_list(B) end.可想而知,如果格式不是 ~ts,或者数据不是binary,都不会做unicode转换。
原文:http://blog.csdn.net/mycwq/article/details/50762572