Closed GoogleCodeExporter closed 9 years ago
Here is my 2.04 fix to boxread.cpp:
// Reads lines from box_file until one yields a usable box, and returns it.
//
// Each line is expected to hold a UTF-8 string, then whitespace, then four
// integers (x_min y_min x_max y_max) and an optional fifth integer giving the
// page number. Lines without a page number are treated as page 0.
//
// target_page: if >= 0, only boxes on this page are returned; other lines are
//              skipped silently. If negative, boxes from any page are
//              returned.
// box_file:    open stream to read from. It is fclose()d by this function
//              when end of file is reached, so the caller must not use it
//              after a false return.
// utf8_str:    receives the NUL-terminated text of the box. At most
//              UNICHAR_LEN bytes plus the terminator are written, since
//              longer strings are rejected before the copy.
// x_min..y_max: receive the box coordinates. They may be overwritten even
//              for lines that end up rejected or skipped.
//
// Returns true when a box was read, false at end of file. Malformed lines are
// reported through tprintf and skipped.
//
// A static line counter is kept between calls for error messages and is
// reset at end of file, so the function carries state across calls.
bool read_next_box(int target_page, FILE* box_file, char* utf8_str,
                   int* x_min, int* y_min, int* x_max, int* y_max) {
  static int line = 0;
  int count = 0;
  int page = 0;
  char buff[kBoxReadBufSize];  // boxfile read buffer
  char *uch;
  char *buffptr = buff;

  while (fgets(buff, sizeof(buff) - 1, box_file)) {
    line++;
    buffptr = buff;
    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
    // Short-circuit evaluation keeps these reads inside the NUL-terminated
    // string even when the line is shorter than three bytes.
    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
      buffptr += 3;  // Skip unicode file designation.

    // Skip leading blanks so that blank lines yield an empty token.
    while (*buffptr == ' ' || *buffptr == '\t')
      buffptr++;

    // Simply scan over the uch rather than relying on sscanf("%s"), which
    // fails on some unicode input such as Tibetan consonants.
    // The scan must also stop at the terminator: a line with no blank after
    // the first token (including an empty line) would otherwise run off the
    // end of the buffer.
    uch = buffptr;
    while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t')
      buffptr++;
    if (*buffptr == '\0')
      continue;  // No separator after the token: nothing to parse here.

    *buffptr++ = 0;  // Terminate the token; buffptr now points at the numbers.
    count = sscanf(buffptr, "%d %d %d %d %d",
                   x_min, y_min, x_max, y_max, &page);
    if (count != 5) {
      // No (valid) page field: treat the line as belonging to page 0.
      page = 0;
      count = sscanf(buffptr, "%d %d %d %d",
                     x_min, y_min, x_max, y_max);
    }
    if (target_page >= 0 && target_page != page)
      continue;  // Not on the appropriate page.

    if (count == 4) {
#if debug_utf8
      // Print the hex codes of the utf8 code.
      int x;
      for (x = 0; buff[x] != '\0'; ++x)
        tprintf("[%02x]", (unsigned char)buff[x]);
      tprintf("\n");
      for (x = 0; uch[x] != '\0'; ++x)
        tprintf("[%02x]", (unsigned char)uch[x]);
      tprintf(" %d %d %d %d\n", *x_min, *y_min, *x_max, *y_max);
#endif
      // Validate UTF8 by making unichars with it. A utf8_len() of 0 is
      // taken to mean the bytes at that position are not valid UTF-8.
      int used = 0;
      int uch_len = strlen(uch);
      while (used < uch_len) {
        UNICHAR ch(uch + used, uch_len - used);
        int new_used = ch.utf8_len();
        if (new_used == 0) {
          tprintf("Bad utf-8 char starting with 0x%02x at col %d, line %d \n",
                  (unsigned char)uch[used], used + 1, line);
          count = 0;
          break;
        }
        used += new_used;
      }
      if (uch_len > UNICHAR_LEN) {
        tprintf("utf-8 string too long at line %d\n", line);
        count = 0;  // Reject, which also keeps the strcpy below bounded.
      }
    }

    if (count < 4) {
      tprintf("Box file format error on line %i ignored\n", line);
    } else {
      strcpy(utf8_str, uch);
      return true;  // read a box ok
    }
  }

  fclose(box_file);
  line = 0;
  return false;  // EOF
}
Original comment by bgspe...@gmail.com
on 1 Nov 2009 at 7:51
This issue was closed by revision r339.
Original comment by theraysm...@gmail.com
on 19 May 2010 at 10:07
Original issue reported on code.google.com by
bgspe...@gmail.com
on 15 Oct 2009 at 12:33