#define UTFG 1 /* Copyright 2001-2004 Unicode, Inc.; modifications Copyright 2007 Wenlin Institute, Inc. * * Disclaimer * * This source code is provided as is by Unicode, Inc. No claims are * made as to fitness for any particular purpose. No warranties of any * kind are expressed or implied. The recipient agrees to determine * applicability of information provided. If this file has been * purchased on magnetic or optical media from Unicode, Inc., the * sole remedy for any claim will be exchange of defective media * within 90 days of receipt. * * Limitations on Rights to Redistribute This Code * * Unicode, Inc. hereby grants the right to freely use the information * supplied in this file in the creation of products supporting the * Unicode Standard, and to make copies of this file in any form * for internal or external distribution as long as this notice * remains attached. * * (Wenlin Institute, Inc., likewise for the modifications.) * * utfg_harness.c * * This is a test harness for "ConvertUTF.c". Compile this * and run without arguments. It will exhaustively test * the conversion routines, and print a few lines of diagnostic * output. You don't need to compile ConvertUTF.c itself, * since it gets #included here along with the header. * Example of a compile line: * * $ gcc -g utfg_harness.c -o utfg_harness * * Rev History: Rick McGowan, new file April 2001. * Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2] * per report from Iain Murray. * July 3, 2003: Updated printout message. * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch * illegal surrogate use in UTF-8, per report from Frank Tang. * * Thomas Bishop, extended for UTF-G, July 2007. */ #define CVTUTF_DEBUG 1 #include #include # tbishop added #include # tbishop added #if UTFG #include "ConvertUTFG.c" #define ConvertUTF8toUTF16 ConvertUTFG8toUTFG16 #define ConvertUTF16toUTF8 ConvertUTFG16toUTFG8 #define ConvertUTF8toUTF32 ConvertUTFG8toUTFG32 #define ConvertUTF32toUTF8 ConvertUTFG32toUTFG8 #define ConvertUTF32toUTF16 ConvertUTFG32toUTFG16 #define ConvertUTF16toUTF32 ConvertUTFG16toUTFG32 #define isLegalUTF8 isLegalUTFG8 #define isLegalUTF8Sequence isLegalUTFG8Sequence #else #include "ConvertUTF.c" #endif /* --------------------------------------------------------------------- test01 - Spot check a few legal & illegal UTF-8 values only. This is not an exhaustive test, just a brief one that was used to develop the "isLegalUTF8" routine. Legal UTF-8 sequences are: 1st---- 2nd---- 3rd---- 4th---- Codepoints--- 00-7F 0000- 007F C2-DF 80-BF 0080- 07FF E0 A0-BF 80-BF 0800- 0FFF E1-EC 80-BF 80-BF 1000- CFFF ED 80-9F 80-BF D000- D7FF EE-EF 80-BF 80-BF E000- FFFF F0 90-BF 80-BF 80-BF 10000- 3FFFF F1-F3 80-BF 80-BF 80-BF 40000- FFFFF F4 80-8F 80-BF 80-BF 100000-10FFFF --------------------------------------------------------------------- */ struct utf8_test { Boolean utf8_legal; /* is legal sequence? */ int utf8_len; /* length of sequence */ unsigned char utf8_seq[5]; /* the sequence */ }; struct utf8_test utf8_testData[] = { { 1, 1, { 0x7A, 0x00, 0x00, 0x00, 0x00 }}, /* 0 */ { 1, 2, { 0xC2, 0xAC, 0x00, 0x00, 0x00 }}, /* 1 */ { 1, 2, { 0xDF, 0xB2, 0x00, 0x00, 0x00 }}, /* 2 */ { 1, 3, { 0xE0, 0xA1, 0x81, 0x00, 0x00 }}, /* 3 */ { 1, 3, { 0xE1, 0xAC, 0x90, 0x00, 0x00 }}, /* 4 */ { 1, 3, { 0xF0, 0x93, 0xB2, 0xA1, 0x00 }}, /* 5 */ { 1, 4, { 0xF1, 0x87, 0x9A, 0xB0, 0x00 }}, /* 6 */ { 1, 4, { 0xF3, 0x88, 0x9B, 0xAD, 0x00 }}, /* 7 */ { 1, 4, { 0xF4, 0x82, 0x89, 0x8F, 0x00 }}, /* 8 */ { 0, 3, { 0x82, 0x00, 0x00, 0x00, 0x00 }}, /* 9 */ #if UTFG /* tbishop changed utf8_len from 2 to 4, according to initial byte. */ { 0, 4, { 0xF8, 0xAC, 0x00, 0x00, 0x00 }}, /* 10 */ #else { 0, 2, { 0xF8, 0xAC, 0x00, 0x00, 0x00 }}, /* 10 */ #endif { 0, 2, { 0xE1, 0xFC, 0xFF, 0x00, 0x00 }}, /* 11 */ { 0, 3, { 0xC2, 0xFC, 0x00, 0x00, 0x00 }}, /* 12 */ { 0, 3, { 0xE1, 0xC2, 0x81, 0x00, 0x00 }}, /* 13 */ { 0, 2, { 0xC2, 0xC1, 0x00, 0x00, 0x00 }}, /* 14 */ { 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */ { 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */ { 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */ { 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */ { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */ { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */ { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */ /* for all > 21 use "short" buffer lengths to detect over-run */ { 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */ { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}, }; int test01() { int i; int rval, wantVal1, wantVal2, gotVal1, gotVal2, len2; printf("Begin Test01\n"); fflush(stdout); rval = 0; for (i = 0; utf8_testData[i].utf8_len; i++) { wantVal1 = wantVal2 = utf8_testData[i].utf8_legal; gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len); /* use truncated length for tests over 21 */ if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2); if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) { printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n", i, gotVal1, gotVal2, wantVal1, wantVal2, utf8_testData[i].utf8_seq[0], utf8_testData[i].utf8_seq[1], utf8_testData[i].utf8_seq[2], utf8_testData[i].utf8_seq[3], utf8_testData[i].utf8_seq[4], utf8_testData[i].utf8_len); ++rval; } } return (rval ? 0 : 1); } /* --------------------------------------------------------------------- test02 - Test round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32 This is an exhaustive test of values 0 through 0x10FFFF. It takes each integer value and converts from UTC4 through the other encoding forms, and back to UTR32, checking the results along the way. It does not check the un-paired low surrogates, except for the first low surrogate. It intends to get that one illegal result, prints a message, and continues with tests. --------------------------------------------------------------------- */ int test02() { #if UTFG UTF32 i, n; ConversionResult result; #define BUF_UNITS 10 UTF32 utf32_buf[BUF_UNITS], utf32_result[BUF_UNITS]; UTF16 utf16_buf[BUF_UNITS], utf16_result[BUF_UNITS]; UTF8 utf8_buf[BUF_UNITS]; #else int i, n; ConversionResult result; UTF32 utf32_buf[2], utf32_result[2]; UTF16 utf16_buf[3], utf16_result[3]; UTF8 utf8_buf[8]; #endif UTF32 *utf32SourceStart, *utf32TargetStart; UTF16 *utf16SourceStart, *utf16TargetStart; UTF8 *utf8SourceStart, *utf8TargetStart; printf("Begin Test02\n"); fflush(stdout); #if UTFG for (i = 0; i <= 0x7FFFFFFF; i++) { memset(utf32_result, 0, sizeof(utf32_result)); memset(utf16_buf, 0, sizeof(utf16_buf)); memset(utf16_result, 0, sizeof(utf16_result)); memset(utf8_buf, 0, sizeof(utf8_buf)); utf32_buf[0] = i; utf32_buf[1] = 0; #else for (i = 0; i <= 0x10FFFF; i++) { utf32_buf[0] = i; utf32_buf[1] = 0; utf32_result[0] = utf32_result[1] = 0; utf16_buf[0] = utf16_buf[1] = utf16_buf[2] = 0; utf16_result[0] = utf16_result[1] = utf16_result[2] = 0; for (n = 0; n < 8; n++) utf8_buf[n] = 0; #endif utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; utf16TargetStart = utf16SourceStart = utf16_buf; utf8TargetStart = utf8SourceStart = utf8_buf; /* * Test UTF32 -> UTF16 */ #if UTFG result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[BUF_UNITS - 1]), strictConversion); #else result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); #endif if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) { /* skip result checking for all but 0000d800, which we know to be illegal */ switch (result) { default: fprintf(stderr, "Test02A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; case sourceExhausted: printf("sourceExhausted\t"); break; case targetExhausted: printf("targetExhausted\t"); break; case sourceIllegal: printf("sourceIllegal\t"); break; } } if (result != conversionOK) { if (i <= UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) { printf("Test02A for %d, input %08x, output %04x,%04x, result %d\n", i, utf32_buf[0], utf16_buf[0], utf16_buf[1], result); if ((i != UNI_SUR_HIGH_START) || (result != sourceIllegal)) { return 0; } else { printf("!!! Test02A: note expected illegal result for 0x0000D800\n"); } } } if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue; /* * Test UTF16 -> UTF8, with legality check on. We check for everything except * for unpaired low surrogates. We do make one check that the lowest low * surrogate, when unpaired, is illegal. */ #if UTFG { int utf16UnitCount = 1; while (utf16_buf[utf16UnitCount] != 0) { ++utf16UnitCount; } result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[utf16UnitCount]), &utf8TargetStart, &(utf8_buf[BUF_UNITS - 1]), strictConversion); } #else result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion); #endif switch (result) { default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; case sourceExhausted: printf("sourceExhausted\t"); break; case targetExhausted: printf("targetExhausted\t"); break; case sourceIllegal: printf("sourceIllegal\t"); break; } if (result != conversionOK) { printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n", i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result); #if UTFG { int j; for (j = 0; utf16_buf[j] != 0; j++) { printf("utf16_buf[%d] = %04x\n", j, utf16_buf[j]); } } #endif if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) { return 0; } else { /* Note: This illegal result only happens if we remove the surrogate check in Test02A. So it shouldn't be seen unless that check and the "continue" are removed in the test above. */ if (i == UNI_SUR_LOW_START) printf("!!! Test02B: note expected illegal result for 0xDC00,0000\n"); else if (i == UNI_SUR_HIGH_START) printf("!!! Test02B: note expected illegal result for 0xD800,0000\n"); } } if ((i == UNI_SUR_LOW_START) && result != sourceIllegal) { printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n", i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result); printf("Test02B: expected illegal result for 0xDC00,0000 was not flagged illegal.\n"); return 0; } if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue; /* * Reset some result buffer pointers for the trip back. */ utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; utf16TargetStart = utf16SourceStart = utf16_result; utf8TargetStart = utf8SourceStart = utf8_buf; /* * Test UTF8 -> UTF16, with legality check on. */ #if UTFG result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[BUF_UNITS - 1]), strictConversion); #else result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion); #endif switch (result) { default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; case sourceExhausted: printf("sourceExhausted\t"); break; case targetExhausted: printf("targetExhausted\t"); break; case sourceIllegal: printf("sourceIllegal\t"); break; } if (result != conversionOK) { printf("Test02C for %d (0x%x), input %s; output %04x,%04x; result %d\n", i, utf32_buf[0], utf8_buf, utf16_buf[0], utf16_buf[1], result); return 0; } #if UTFG for (n = 0; n < BUF_UNITS; n++) { /* check that the utf16 result is the same as what went in. */ #else for (n = 0; n < 3; n++) { /* check that the utf16 result is the same as what went in. */ #endif if (utf16_buf[n] != utf16_result[n]) { printf("Test02C error: input = 0x%08x; utf16_buf = 0x%04x,0x%04x; utf16_result = 0x%04x,0x%04x\n", utf32_buf[0], utf16_buf[0], utf16_buf[1], utf16_result[0], utf16_result[1]); return 0; } } /* * Test UTF16 -> UTF32, with legality check on. If the result of our previous * conversion gave us a "surrogate pair", then we need to convert 2 entities * back to UTF32. */ #if UTFG { int utf16UnitCount = 1; while (utf16_result[utf16UnitCount] != 0) { ++utf16UnitCount; } result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[utf16UnitCount]), &utf32TargetStart, &(utf32_result[1]), strictConversion); } #else if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) { result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion); } else { result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); } #endif switch (result) { default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; case sourceExhausted: printf("sourceExhausted\t"); break; case targetExhausted: printf("targetExhausted\t"); break; case sourceIllegal: printf("sourceIllegal\t"); break; } if (result != conversionOK) { printf("Test02D for %d (0x%x), input %04x,%04x; output %08x; result %d\n", i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf32_result[0], result); return 0; } /* * Now, check the final round-trip value. */ if (utf32_buf[0] != utf32_result[0]) { printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]); return 0; } } return 1; } /* --------------------------------------------------------------------- test03 - Test round trip UTF32 -> UTF8 -> UTF32 This tests the functions that were not tested by test02 above. For each UTF32 value 0 through 0x7FFFFFFF, it tests the conversion to UTF-8 and back. The test is exhaustive. --------------------------------------------------------------------- */ int test03() { int i, n; ConversionResult result; UTF32 utf32_buf[2], utf32_result[2]; UTF8 utf8_buf[8]; UTF32 *utf32SourceStart, *utf32TargetStart; UTF8 *utf8SourceStart, *utf8TargetStart; printf("Begin Test03\n"); fflush(stdout); #if UTFG for (i = 0; i <= 0x7FFFFFFF; i++) { #else for (i = 0; i <= 0x10FFFF; i++) { #endif /* Skip all surrogates except UNI_SUR_HIGH_START, which we test for illegality. */ if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue; utf32_buf[0] = i; utf32_buf[1] = 0; utf32_result[0] = utf32_result[1] = 0; for (n = 0; n < 8; n++) utf8_buf[n] = 0; utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; utf8TargetStart = utf8SourceStart = utf8_buf; /* * Test UTF32 -> UTF8, with legality check on. */ result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); switch (result) { default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; case sourceExhausted: printf("sourceExhausted\t"); break; case targetExhausted: printf("targetExhausted\t"); break; case sourceIllegal: printf("sourceIllegal\t"); break; } if (result != conversionOK) { printf("Test03A for %d (0x%x); output %s; result %d\n", i, utf32_buf[0], utf8_buf, result); if (i != UNI_SUR_HIGH_START) { return 0; } else { printf("!!! Test03A: note expected illegal result for 0x0000D800\n"); } } if ((i == UNI_SUR_HIGH_START) && result != sourceIllegal) { printf("Test03A for %d (0x%x); output %s; result %d\n", i, utf32_buf[0], utf8_buf, result); printf("Test03A: expected illegal result for 0x0000D800 was not flagged illegal.\n"); return 0; } if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue; /* * Reset some result buffer pointers for the trip back. */ utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; utf8TargetStart = utf8SourceStart = utf8_buf; /* * Test UTF8 -> UTF32, with legality check on. */ result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); switch (result) { default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; case sourceExhausted: printf("sourceExhausted\t"); break; case targetExhausted: printf("targetExhausted\t"); break; case sourceIllegal: printf("sourceIllegal\t"); break; } if (result != conversionOK) { printf("Test03B for %d (0x%x), input %s; output 0x%08x; result %d\n", i, utf32_buf[0], utf8_buf, utf32_result[0], result); return 0; } /* * Now, check the final round-trip value. */ if (utf32_buf[0] != utf32_result[0]) { printf("Test03C for %d: utf32 input %08x; utf8 buf %s; trip output %08x\n", i, utf32_buf[0], utf8_buf, utf32_result[0]); return 0; } } return 1; } /* --------------------------------------------------------------------- test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8. Expect it will be turned into UNI_REPLACEMENT_CHAR. --------------------------------------------------------------------- */ int test04() { UTF32 i, n; ConversionResult result; UTF32 utf32_buf[2]; UTF8 utf8_buf[8]; UTF32 *utf32SourceStart, *utf32TargetStart; UTF8 *utf8SourceStart, *utf8TargetStart; printf("Begin Test04\n"); fflush(stdout); #if UTFG utf32_buf[0] = (UTF32) 0x7FFFFFFF + 21; /* an arbitrary value > legal */ utf32_buf[1] = 0; #else i = 0x10FFFF + 21; /* an arbitrary value > legal */ utf32_buf[0] = i; utf32_buf[1] = 0; #endif for (n = 0; n < 8; n++) utf8_buf[n] = 0; utf32SourceStart = utf32_buf; utf8TargetStart = utf8_buf; /* * Test UTF32 -> UTF8, with legality check on. */ result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); if (result != sourceIllegal) { fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); } return 1; } /* --------------------------------------------------------------------- */ main() { printf("Three tests of round-trip conversions will be performed.\n"); printf("One test of illegal UTF-32 will be performed.\n"); /* tbishop was "peroformed" */ printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n"); printf("These are for tests of Surrogate conversion.\n\n"); fflush(stdout); if (test01()) { printf("******** Test01 succeeded without error. ********\n\n"); } else { printf("-------- Test01 failed. --------\n\n"); } if (test02()) { printf("******** Test02 succeeded without error. ********\n\n"); } else { printf("-------- Test02 failed. --------\n\n"); } if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); } else { printf("-------- Test03 failed. --------\n\n"); } if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); } else { printf("-------- Test04 failed. --------\n\n"); } }