HTTrack Website Copier
Free software offline browser - FORUM
Subject: Re: Can't read the web site that is written in Unicode
Author: Xavier Roche
Date: 12/03/2005 14:12
 
And here's the diff:

diff -rudb httrack-3.39.53.orig/src/htscore.c httrack-3.39.53/src/htscore.c
--- httrack-3.39.53.orig/src/htscore.c	2005-12-03 14:11:28.875560939 +0100
+++ httrack-3.39.53/src/htscore.c	2005-12-03 14:08:33.000000000 +0100
@@ -1118,7 +1118,7 @@
                 nspec += map[i];
               }
             }
-            /* On-the-fly UCS2 to ISO-8859-1 conversion (note: UCS2 should never be used on the net) */
+            /* On-the-fly UCS2 to UTF-8 conversion (note: UCS2 should never be used on the net) */
             if (
               map[0] > r.size/10
               &&
@@ -1129,30 +1129,101 @@
               ||
               ( ((unsigned char) r.adr[0]) == 0xfe && ((unsigned char) r.adr[1]) == 0xff)
               )
-              ) {
-              int lost=0;
+							) 
+						{
+#define CH_ADD(c) do {															\
+	if (new_offs + 1 > new_capa) {										\
+		new_capa *= 2;																	\
+		new_adr = (unsigned char*) realloct(new_adr,    \
+		                                    new_capa); 	\
+		assertf(new_adr != NULL);												\
+	}																									\
+	new_adr[new_offs++] = (unsigned char) (c);        \
+} while(0)
+#define CH_ADD_RNG1(c, r, o) do {                   \
+	CH_ADD( (c) / (r) + (o) );                        \
+	c = (c) % (r);                                    \
+} while(0)
+#define CH_ADD_RNG0(c, o) do {                      \
+	CH_ADD_RNG1(c, 1, o); 	 													\
+} while(0)
+#define CH_ADD_RNG2(c, r, r2, o) do {               \
+	CH_ADD_RNG1(c, (r) * (r2), o);	 									\
+} while(0)
+							int new_capa = r.size / 2 + 1;
+							int new_offs = 0;
+							unsigned char* prev_adr = (unsigned char*) r.adr;
+							unsigned char* new_adr = (unsigned char*) malloct(new_capa);
               int i;
               int swap = (((unsigned char)r.adr[0]) == 0xff);
-              for(i = 0 ; i < r.size / 2 - 1 ; i++) {
-                unsigned int unic = 0;
+							assertf(new_adr != NULL);
+							/* 
+							See <http://www.unicode.org/reports/tr28/tr28-3.html#conformance> 
+							U+0000..U+007F 00..7F       
+							U+0080..U+07FF C2..DF  80..BF      
+							U+0800..U+0FFF E0      A0..BF  80..BF    
+							U+1000..U+CFFF E1..EC  80..BF  80..BF    
+							U+D000..U+D7FF ED      80..9F  80..BF    
+							U+D800..U+DFFF
+							U+E000..U+FFFF EE..EF  80..BF  80..BF    
+							*/
+							for(i = 0 ; i < r.size / 2 ; i++) {
+								unsigned short int unic = 0;
                 if (swap)
-                  unic = (r.adr[i*2 + 2] << 8) + r.adr[i*2 + 2 + 1];
+									unic = prev_adr[i*2] + (prev_adr[i*2 + 1] << 8);
                 else
-                  unic = r.adr[i*2 + 2] + (r.adr[i*2 + 2 + 1] << 8);
-                if (unic <= 255)
-                  r.adr[i] = (char) unic;
-                else {
-                  r.adr[i] = '?';
-                  lost++;
+									unic = (prev_adr[i*2] << 8) + prev_adr[i*2 + 1];
+								if (unic <= 0x7F) {
+									/* U+0000..U+007F 00..7F      */
+									CH_ADD_RNG0( unic,               0x00 );
+								} else if (unic <= 0x07FF) {
+									/* U+0080..U+07FF C2..DF  80..BF */
+									unic -= 0x0080;
+									CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0xc2 );
+									CH_ADD_RNG0( unic,                  0x80 );
+								} else if (unic <= 0x0FFF) {
+									/* U+0800..U+0FFF E0      A0..BF  80..BF */
+									unic -= 0x0800;
+									CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0xbf - 0xa0 + 1, 0xe0 );
+									CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0xa0 );
+									CH_ADD_RNG0( unic,                  0x80 );
+								} else if (unic <= 0xCFFF) {
+									/* U+1000..U+CFFF E1..EC  80..BF  80..BF */
+									unic -= 0x1000;
+									CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0xbf - 0x80 + 1, 0xe1 );
+									CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0x80 );
+									CH_ADD_RNG0( unic,                  0x80 );
+								} else if (unic <= 0xD7FF) {
+									/* U+D000..U+D7FF ED      80..9F  80..BF */
+									unic -= 0xD000;
+									CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0x9f - 0x80 + 1, 0xed );
+									CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0x80 );
+									CH_ADD_RNG0( unic,                  0x80 );
+								} else if (unic <= 0xDFFF) {
+									/* U+D800..U+DFFF */
+									CH_ADD('?');
+									/* ill-formed */
+								} else if (unic <= 0xFFFF) {
+									/* U+E000..U+FFFF EE..EF  80..BF  80..BF */
+									unic -= 0xE000;
+									CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0xbf - 0x80 + 1, 0xee );
+									CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0x80 );
+									CH_ADD_RNG0( unic,                  0x80 );
                 }
               }
-              r.size = r.size / 2 - 1;
-              r.adr[r.size] = '\0';
-
               if (opt.errlog) {
-                fspc(opt.errlog,"warning"); fprintf(opt.errlog,"File %s%s converted from UCS2 to 8-bit, %d characters lost during conversion (better to use UTF-8)"LF, urladr, urlfil, lost);
+								fspc(opt.errlog,"warning"); fprintf(opt.errlog,"File %s%s converted from UCS2 to UTF-8 (old size: %d bytes, new size: %d bytes)"LF, urladr, urlfil, (int)r.size, new_offs);
                 test_flush;
               }
+							freet(r.adr);
+							r.adr = NULL;
+							r.size = new_offs;
+							CH_ADD(0);
+							r.adr = (char*) new_adr;
+#undef CH_ADD
+#undef CH_ADD_RNG0
+#undef CH_ADD_RNG1
+#undef CH_ADD_RNG2
             } else if ((nspec > r.size / 100) && (nspec > 10)) {    // too many special characters
               strcpybuff(r.contenttype,"application/octet-stream");
               if (opt.errlog) {
 
Reply Create subthread


All articles

Subject Author Date
Re: Can't read the web site that is written in Unicode

12/03/2005 12:17
Re: Can't read the web site that is written in Unicode

12/03/2005 14:12




0

Created with FORUM 2.0.11