リンク: [ホーム] [自己紹介] [リンク集] [アルバム] [ソフトウェア] [発表文献] [その他]

まさおのChangeLogメモ / 2006-06-05

01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

2006-06-05 Mon

* syck+utf-8+escape

[2006-05-31]で調べた、YAML形式のファイル中で、UTF-8文字列のうち
ASCII範囲外の文字がのきなみエスケープされて出力されてしまう問題。

手元で簡単なパッチを書いたら直るようなので、メモしておく:

Index: ext/syck/emitter.c
===================================================================
RCS file: /src/ruby/ext/syck/emitter.c,v
retrieving revision 1.16
diff -u -b -r1.16 emitter.c
--- ext/syck/emitter.c 2 Feb 2006 15:02:49 -0000 1.16
+++ ext/syck/emitter.c 4 Jun 2006 19:53:26 -0000
@@ -776,25 +776,56 @@
 syck_emitter_escape( SyckEmitter *e, char *src, long len )
 {
     int i;
+ unsigned int c1, c2, c3;
     for( i = 0; i < len; i++ )
     {
- if( (src[i] < 0x20) || (0x7E < src[i]) )
+ c1 = src[i];
+ if (i + 1 < len)
+ c2 = src[i+1];
+ if (i + 2 < len)
+ c3 = src[i+2];
+
+ if( (0x20 <= c1) && (c1 <= 0x7E) )
         {
+ syck_emitter_write( e, src + i, 1 );
+ if( '\\' == c1 )
             syck_emitter_write( e, "\\", 1 );
- if( '\0' == src[i] )
- syck_emitter_write( e, "0", 1 );
- else
- {
- syck_emitter_write( e, "x", 1 );
- syck_emitter_write( e, (char *)hex_table + ((src[i] & 0xF0) >> 4), 1 );
- syck_emitter_write( e, (char *)hex_table + (src[i] & 0x0F), 1 );
             }
+ else if ( ((i + 1) < len) && /* UTF-8 - 2 bytes */
+ (0xc2 <= c1) && (c1 <= 0xdf) &&
+ (0x80 <= c2) && (c2 <= 0xbf) )
+ {
+ syck_emitter_write( e, src + i, 2 );
+ i++;
+ }
+ else if ( ((i + 2) < len) && /* UTF-8 - 3 bytes */
+ ( (0xe0 == c1) &&
+ (0xa0 <= c2) && (c2 <= 0xbf) &&
+ (0x80 <= c3) && (c3 <= 0xbf) ) ||
+ ( (0xe1 <= c1) && (c1 <= 0xec) &&
+ (0x80 <= c2) && (c2 <= 0xbf) &&
+ (0x80 <= c3) && (c3 <= 0xbf) ) ||
+ ( (0xed == c1) &&
+ (0x80 <= c2) && (c2 <= 0x9f) &&
+ (0x80 <= c3) && (c3 <= 0xbf) ) ||
+ ( (0xee <= c1) && (c1 <= 0xef) &&
+ (0x80 <= c2) && (c2 <= 0xbf) &&
+ (0x80 <= c3) && (c3 <= 0xbf) ) )
+ {
+ syck_emitter_write( e, src + i, 3 );
+ i += 2;
         }
         else
         {
- syck_emitter_write( e, src + i, 1 );
- if( '\\' == src[i] )
                 syck_emitter_write( e, "\\", 1 );
+ if( '\0' == c1 )
+ syck_emitter_write( e, "0", 1 );
+ else
+ {
+ syck_emitter_write( e, "x", 1 );
+ syck_emitter_write( e, (char *)hex_table + ((c1 & 0xF0) >> 4), 1 );
+ syck_emitter_write( e, (char *)hex_table + (c1 & 0x0F), 1 );
+ }
         }
     }
 }
@@ -849,12 +880,13 @@
 /*
  * Outputs a double-quoted block.
  */
-void syck_emit_2quoted( SyckEmitter *e, int width, char *str, long len )
+void syck_emit_2quoted( SyckEmitter *e, int width, unsigned char *str, long len )
 {
     char do_indent = 0;
- char *mark = str;
- char *start = str;
- char *end = str;
+ unsigned char *mark = str;
+ unsigned char *start = str;
+ unsigned char *end = str;
+ unsigned int c, c2, c3;
     syck_emitter_write( e, "\"", 1 );
     while ( mark < str + len ) {
         if ( do_indent > 0 ) {
@@ -864,21 +896,34 @@
             syck_emit_indent( e );
             do_indent = 0;
         }
- switch ( *mark ) {
-
- /* Escape sequences allowed within double quotes. */
- case '"': syck_emitter_write( e, "\\\"", 2 ); break;
- case '\\': syck_emitter_write( e, "\\\\", 2 ); break;
- case '\0': syck_emitter_write( e, "\\0", 2 ); break;
- case '\a': syck_emitter_write( e, "\\a", 2 ); break;
- case '\b': syck_emitter_write( e, "\\b", 2 ); break;
- case '\f': syck_emitter_write( e, "\\f", 2 ); break;
- case '\r': syck_emitter_write( e, "\\r", 2 ); break;
- case '\t': syck_emitter_write( e, "\\t", 2 ); break;
- case '\v': syck_emitter_write( e, "\\v", 2 ); break;
- case 0x1b: syck_emitter_write( e, "\\e", 2 ); break;
 
- case '\n':
+ c = *mark;
+ if ( mark + 1 < str + len)
+ c2 = *(mark + 1);
+ if ( mark + 2 < str + len)
+ c3 = *(mark + 2);
+ if (c == '"') /* Escape sequences allowed within double quotes. */
+ syck_emitter_write( e, "\\\"", 2 );
+ else if (c == '\\')
+ syck_emitter_write( e, "\\\\", 2 );
+ else if (c == '\0')
+ syck_emitter_write( e, "\\0", 2 );
+ else if (c == '\a')
+ syck_emitter_write( e, "\\a", 2 );
+ else if (c == '\b')
+ syck_emitter_write( e, "\\b", 2 );
+ else if (c == '\f')
+ syck_emitter_write( e, "\\f", 2 );
+ else if (c == '\r')
+ syck_emitter_write( e, "\\r", 2 );
+ else if (c == '\t')
+ syck_emitter_write( e, "\\t", 2 );
+ else if (c == '\v')
+ syck_emitter_write( e, "\\v", 2 );
+ else if (c == 0x1b)
+ syck_emitter_write( e, "\\e", 2 );
+ else if (c == '\n')
+ {
                 end = mark + 1;
                 syck_emitter_write( e, "\\n", 2 );
                 do_indent = 2;
@@ -886,20 +931,47 @@
                 if ( start < str + len && ( *start == ' ' || *start == '\n' ) ) {
                     do_indent = 0;
                 }
- break;
-
- case ' ':
+ }
+ else if (c == ' ')
+ {
                 if ( width > 0 && *start != ' ' && mark - end > width ) {
                     do_indent = 1;
                     end = mark + 1;
                 } else {
                     syck_emitter_write( e, " ", 1 );
                 }
- break;
-
- default:
+ }
+ else if ( (0x20 <= c) && (c <= 0x7E) )
+ {
+ syck_emitter_write( e, mark, 1 );
+ }
+ else if ( (mark + 1) < (str + len) && /* UTF-8 - 2 bytes */
+ (0xC2 <= c) && (c <= 0xDF) &&
+ (0x80 <= c2) && (c2 <= 0xBF) )
+ {
+ syck_emitter_write( e, mark, 2 );
+ mark++;
+ }
+ else if ( (mark + 2) < (str + len) && /* UTF-8 - 3 bytes */
+ ( (0xE0 == c) &&
+ (0xA0 <= c2) && (c2 <= 0xBF) &&
+ (0x80 <= c3) && (c3 <= 0xBF) ) ||
+ ( (0xE1 <= c) && (c <= 0xEC) &&
+ (0x80 <= c2) && (c2 <= 0xBF) &&
+ (0x80 <= c3) && (c3 <= 0xBF) ) ||
+ ( (0xED == c) &&
+ (0x80 <= c2) && (c2 <= 0x9F) &&
+ (0x80 <= c3) && (c3 <= 0xBF) ) ||
+ ( (0xEE <= c) && (c <= 0xEF) &&
+ (0x80 <= c2) && (c2 <= 0xBF) &&
+ (0x80 <= c3) && (c3 <= 0xBF) ) )
+ {
+ syck_emitter_write( e, mark, 3 );
+ mark += 2;
+ }
+ else
+ {
                 syck_emitter_escape( e, mark, 1 );
- break;
         }
         mark++;
     }
Index: ext/syck/syck.h
===================================================================
RCS file: /src/ruby/ext/syck/syck.h,v
retrieving revision 1.30
diff -u -b -r1.30 syck.h
--- ext/syck/syck.h 19 Dec 2005 14:13:27 -0000 1.30
+++ ext/syck/syck.h 4 Jun 2006 19:53:26 -0000
@@ -376,7 +376,7 @@
 void syck_emit( SyckEmitter *, st_data_t );
 void syck_emit_scalar( SyckEmitter *, char *, enum scalar_style, int, int, char, char *, long );
 void syck_emit_1quoted( SyckEmitter *, int, char *, long );
-void syck_emit_2quoted( SyckEmitter *, int, char *, long );
+void syck_emit_2quoted( SyckEmitter *, int, unsigned char *, long );
 void syck_emit_folded( SyckEmitter *, int, char, char *, long );
 void syck_emit_literal( SyckEmitter *, char, char *, long );
 void syck_emit_seq( SyckEmitter *, char *, enum seq_style );
Index: test/yaml/test_yaml.rb
===================================================================
RCS file: /src/ruby/test/yaml/test_yaml.rb,v
retrieving revision 1.18
diff -u -b -r1.18 test_yaml.rb
--- test/yaml/test_yaml.rb 16 Jan 2006 01:28:52 -0000 1.18
+++ test/yaml/test_yaml.rb 4 Jun 2006 19:53:27 -0000
@@ -1272,6 +1272,11 @@
       assert_equal([{}], o.keys)
     end
 
+ def test_toyaml_rawutf8_string
+ # Japanese Hiragana: "AIUEO"
+ assert_equal("--- \"abcdefghijklmnopqrstuvwxyzあいうえお\"\n",
+ "abcdefghijklmnopqrstuvwxyzあいうえお".to_yaml)
+ end
 end
 
 if $0 == __FILE__


CVS HEADに対するもの。

見れば分かる通り、正当なUTF-8の範囲のバイト列に対してはエスケープ
しないよう回避しているだけ。

暇になったらバグ報告する予定。

* SPSSからアウトライングラフ生成

SPSSの生成するグラフは、そのままコピー&ペーストすると、ビットマッ
プ形式になってしまい、拡大縮小すると見るに堪えないほど汚くなって
しまうが、一旦Word/RTF形式にエクスポートすると、アウトライン形式の
画像が得られる。

バッドノウハウ的なのでメモしとく…。