| And here's the diff:
diff -rudb httrack-3.39.53.orig/src/htscore.c httrack-3.39.53/src/htscore.c
--- httrack-3.39.53.orig/src/htscore.c 2005-12-03 14:11:28.875560939 +0100
+++ httrack-3.39.53/src/htscore.c 2005-12-03 14:08:33.000000000 +0100
@@ -1118,7 +1118,7 @@
nspec += map[i];
}
}
- /* On-the-fly UCS2 to ISO-8859-1 conversion (note: UCS2 should never be used on the net) */
+ /* On-the-fly UCS2 to UTF-8 conversion (note: UCS2 should never be used on the net) */
if (
map[0] > r.size/10
&&
@@ -1129,30 +1129,101 @@
||
( ((unsigned char) r.adr[0]) == 0xfe && ((unsigned char) r.adr[1]) == 0xff)
)
- ) {
- int lost=0;
+ )
+ {
+#define CH_ADD(c) do { \
+ if (new_offs + 1 > new_capa) { \
+ new_capa *= 2; \
+ new_adr = (unsigned char*) realloct(new_adr, \
+ new_capa); \
+ assertf(new_adr != NULL); \
+ } \
+ new_adr[new_offs++] = (unsigned char) (c); \
+} while(0)
+#define CH_ADD_RNG1(c, r, o) do { \
+ CH_ADD( (c) / (r) + (o) ); \
+ c = (c) % (r); \
+} while(0)
+#define CH_ADD_RNG0(c, o) do { \
+ CH_ADD_RNG1(c, 1, o); \
+} while(0)
+#define CH_ADD_RNG2(c, r, r2, o) do { \
+ CH_ADD_RNG1(c, (r) * (r2), o); \
+} while(0)
+ int new_capa = r.size / 2 + 1;
+ int new_offs = 0;
+ unsigned char* prev_adr = (unsigned char*) r.adr;
+ unsigned char* new_adr = (unsigned char*) malloct(new_capa);
int i;
int swap = (((unsigned char)r.adr[0]) == 0xff);
- for(i = 0 ; i < r.size / 2 - 1 ; i++) {
- unsigned int unic = 0;
+ assertf(new_adr != NULL);
+ /*
+ See <http://www.unicode.org/reports/tr28/tr28-3.html#conformance>
+ U+0000..U+007F 00..7F
+ U+0080..U+07FF C2..DF 80..BF
+ U+0800..U+0FFF E0 A0..BF 80..BF
+ U+1000..U+CFFF E1..EC 80..BF 80..BF
+ U+D000..U+D7FF ED 80..9F 80..BF
+ U+D800..U+DFFF
+ U+E000..U+FFFF EE..EF 80..BF 80..BF
+ */
+ for(i = 0 ; i < r.size / 2 ; i++) {
+ unsigned short int unic = 0;
if (swap)
- unic = (r.adr[i*2 + 2] << 8) + r.adr[i*2 + 2 + 1];
+ unic = prev_adr[i*2] + (prev_adr[i*2 + 1] << 8);
else
- unic = r.adr[i*2 + 2] + (r.adr[i*2 + 2 + 1] << 8);
- if (unic <= 255)
- r.adr[i] = (char) unic;
- else {
- r.adr[i] = '?';
- lost++;
+ unic = (prev_adr[i*2] << 8) + prev_adr[i*2 + 1];
+ if (unic <= 0x7F) {
+ /* U+0000..U+007F 00..7F */
+ CH_ADD_RNG0( unic, 0x00 );
+ } else if (unic <= 0x07FF) {
+ /* U+0080..U+07FF C2..DF 80..BF */
+ unic -= 0x0080;
+ CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0xc2 );
+ CH_ADD_RNG0( unic, 0x80 );
+ } else if (unic <= 0x0FFF) {
+ /* U+0800..U+0FFF E0 A0..BF 80..BF */
+ unic -= 0x0800;
+ CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0xbf - 0xa0 + 1, 0xe0 );
+ CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0xa0 );
+ CH_ADD_RNG0( unic, 0x80 );
+ } else if (unic <= 0xCFFF) {
+ /* U+1000..U+CFFF E1..EC 80..BF 80..BF */
+ unic -= 0x1000;
+ CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0xbf - 0x80 + 1, 0xe1 );
+ CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0x80 );
+ CH_ADD_RNG0( unic, 0x80 );
+ } else if (unic <= 0xD7FF) {
+ /* U+D000..U+D7FF ED 80..9F 80..BF */
+ unic -= 0xD000;
+ CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0x9f - 0x80 + 1, 0xed );
+ CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0x80 );
+ CH_ADD_RNG0( unic, 0x80 );
+ } else if (unic <= 0xDFFF) {
+ /* U+D800..U+DFFF */
+ CH_ADD('?');
+ /* ill-formed */
+ } else if (unic <= 0xFFFF) {
+ /* U+E000..U+FFFF EE..EF 80..BF 80..BF */
+ unic -= 0xE000;
+ CH_ADD_RNG2( unic, 0xbf - 0x80 + 1, 0xbf - 0x80 + 1, 0xee );
+ CH_ADD_RNG1( unic, 0xbf - 0x80 + 1, 0x80 );
+ CH_ADD_RNG0( unic, 0x80 );
}
}
- r.size = r.size / 2 - 1;
- r.adr[r.size] = '\0';
-
if (opt.errlog) {
- fspc(opt.errlog,"warning"); fprintf(opt.errlog,"File %s%s converted from UCS2 to 8-bit, %d characters lost during conversion (better to use UTF-8)"LF, urladr, urlfil, lost);
+ fspc(opt.errlog,"warning"); fprintf(opt.errlog,"File %s%s converted from UCS2 to UTF-8 (old size: %d bytes, new size: %d bytes)"LF, urladr, urlfil, (int)r.size, new_offs);
test_flush;
}
+ freet(r.adr);
+ r.adr = NULL;
+ r.size = new_offs;
+ CH_ADD(0);
+ r.adr = (char*) new_adr;
+#undef CH_ADD
+#undef CH_ADD_RNG0
+#undef CH_ADD_RNG1
+#undef CH_ADD_RNG2
} else if ((nspec > r.size / 100) && (nspec > 10)) { // too many special characters
strcpybuff(r.contenttype,"application/octet-stream");
if (opt.errlog) {
| |