字符集(数据与字符之间转化)

213cy commented 8 years ago

%% % Character encoding list: % https://www.mathworks.com/help/matlab/ref/fopen.html#btrnibn-1-encodingIn % fopen 的时候可以指定文档的 encoding % 但是matlab 使用以UTF-16的unicode编码存入变量 % 也就是说 matlab 的char 类型的变量是以UTF-16编码的

% ANSI编码 windows os 的特有编码应用规则 % 每个字符占一个或两个字节 % 其中 0-127 总是 ASCII 码(单字节) % 其中 128-65534 是应用当前地区的编码,中国是 chcp 936 == GBK % 可以用 alt + uint16(字符编码) 通过小键盘打出

% 国标编码 GB(国标) GBK(国标扩) % GB2312 和 GBK 用两个字节表示一个汉字(GB18030 另有四字节编码) % 其中 0-127 是 ASCII 码 % 字符多(时间晚,兼容之前的) > 字符少(时间早) % GB18030 > GBK > GB2312

% Unicode 字符集 % 最多四个字节表示世界各国所有字符 % 全球统一 ,庞大的空白 % utf-8 以字节(8 位)为单位的 Unicode 变长编码 % utf-16 以字(16 位)为单位的 Unicode 变长编码 % utf-32 固定四个字节的 Unicode 编码

% ISO-8859 % Unicode之前的世界通用编码标准,好像要被淘汰了. %%

213cy commented 4 years ago

% Compare how char() maps numeric codes with and without a uint8 cast.
% MATLAB documentation, as quoted by the original author: valid codes range
% from 0 to 65535, where codes 0 through 127 correspond to 7-bit ASCII
% characters. The characters that MATLAB can process (other than 7-bit
% ASCII characters) depend upon your current locale setting.

% Alternative range that fits in a single byte: cc = 200:255;
cc = 1111:1166;

% Hex views of the codes, kept for reference:
% num2hex(cc);
% dec2hex(cc);

s  = char(cc)          % codes used directly as 16-bit code units
s2 = char(uint8(cc))   % uint8() saturates values above 255 to 255 first

% Read the stored code units back so the two results can be compared.
uint16(s)
uint16(s2)

%% Decoding bytes that are not valid UTF-8
% Bytes 128..200 are decoded as UTF-8. Original author's note: a character
% that cannot be represented is replaced automatically with FF FD, i.e.
% code 65533 (U+FFFD). The last statement checks that no decoded character
% differs from that code.
cc = 128:200;
A = native2unicode(cc, 'UTF-8')
~any(double(A) ~= 65533)

%% Decode the same byte values under several encoding names
% The values 100..150 cross the 7-bit ASCII limit of 127.
cc = uint16(100:150);

native2unicode(cc, 'ascii')
char(cc)   % direct code-to-character mapping, shown next to the decoders

% Original author's note: 'GB2312-80' behaves noticeably differently from
% the other names.
% NOTE(review): 'GB 2312' and 'GB18030 ' contain an embedded / trailing
% space exactly as in the original -- confirm that this is intentional.
encodingNames = {'UTF-8', 'UTF-16', 'GB2312', 'GB 2312', 'GB2312-80', 'GBK', 'GB18030 '};
for k = 1:numel(encodingNames)
    native2unicode(cc, encodingNames{k})
end

213cy commented 4 years ago

% Inspect how MATLAB stores the character '拷' and how it maps to GBK.
str = '拷'

% Original author's note: characters in MATLAB are two bytes each, so the
% character is converted to a uint16 value first to make typecast usable.
% swapbytes reverses the byte order before the split, which puts the high
% byte first on a little-endian machine.
% format hex
sCode = typecast(swapbytes(uint16(str)), 'uint8')   % bytes of str, high byte first
% sCode = typecast(swapbytes(cast(str, 'uint16')), 'uint8');
% dec2hex(sCode)   % hex dump of str
% format short

% Rebuild the numeric value from the two bytes (high * 256 + low).
sVal = double(sCode) * [256; 1]   % same value as uint16(str)

% Evidence that char data is held as UTF-16 code units (the original author
% adds: at least on the machine where this was run).
char(sVal)                                        % value -> character
native2unicode(sCode, 'UTF-16')                   % the two bytes decoded as UTF-16
native2unicode(sCode, 'UTF-8')                    % original note: this one differs, so the bytes are not UTF-8
isequal(unicode2native(str, 'UTF-16BE'), sCode)   % explicit byte-level comparison

% NOTE: per the MATLAB documentation, native2unicode returns char input
% unchanged, so these two calls reproduce str whatever encoding is named
% and are not evidence about the internal representation.
native2unicode(str, 'UTF-8')
native2unicode(str, 'UTF-16')

% Compare the system default encoding with GBK. On the original author's
% machine the default native encoding turned out to be GBK.
sCode   = unicode2native(str)          % bytes in the system default encoding
sCode_n = unicode2native(str, 'GBK')   % bytes in GBK
isequal(sCode, sCode_n)                % true when the default encoding yields the GBK bytes
sVal = double(sCode_n) * [256; 1]      % GBK code of str as a decimal number
% Original author's note: ALT + sVal = str, i.e. ALT + 49085 = '拷'

213cy commented 4 years ago

%% Why '联通' turns into mojibake when GBK bytes are read as UTF-8
s1 = '联通'   % original note: '联通' represented in UTF-16

% Encode as GBK. The encoding is named explicitly so that the demonstration
% does not depend on the system default encoding.
s2 = unicode2native(s1, 'GBK')
% To display these bytes correctly, decode them with native2unicode(s2, 'GBK').
% Original author's note: Windows Notepad can display GBK text directly,
% whereas MATLAB displays Unicode only and cannot display GBK.

% Force the GBK bytes to be decoded as UTF-8 and show the result as Unicode.
s3 = native2unicode(s2, 'UTF-8')
% Original author's note: one of these characters is not the garbled
% character seen in Notepad.

% Dump s3 as bytes (high byte first on a little-endian machine).
typecast(swapbytes(uint16(s3)), 'uint8')
% Original author's note: the dump contains FF FD, which is substituted
% automatically for a character that cannot be represented.

% The same substitution appears when bytes 128..134 are decoded as UTF-8.
s4 = native2unicode(128:134, 'UTF-8');
typecast(swapbytes(uint16(s4)), 'uint8')

% Encode those placeholder characters as UTF-8 bytes ...
s5 = unicode2native(s4, 'UTF-8')
% ... then interpret the new bytes as GBK and display the result.
s6 = native2unicode(s5, 'GBK')

%% Four kinds of mojibake
% The text of s says the four statements below produce, in order, the
% "ancient-script", "symbol", "pinyin" and "block" styles of garbled text.
s = '依次生成古文派符号派拼音派和方块派(方块在工作空间里显示) 的乱码'
native2unicode(unicode2native(s, 'UTF-8'), 'GBK')          % UTF-8 bytes read as GBK
native2unicode(unicode2native(s, 'UTF-8'), 'ISO-8859-1')   % UTF-8 bytes read as ISO-8859-1
native2unicode(unicode2native(s, 'GBK'), 'ISO-8859-1')     % GBK bytes read as ISO-8859-1
native2unicode(unicode2native(s, 'GBK'), 'UTF-8')          % GBK bytes read as UTF-8
% Original author's note: the blocks are shown as spaces in the Command
% Window and as blocks in the Workspace browser.

cyfile / Matlab-base-toolbox

字符集(数据与字符之间转化) #1