Browse Source

Text enhancement: a truly working codepage auto-detector

Previously, I implemented a codepage auto-detector, but it did not
work properly, so it was removed from the code base and a user-configured
codepage selector was used instead.
Since other languages are now handled through the language-file mechanism,
we only have to distinguish between the two variants of Chinese. With the
detection algorithm fixed, this function now works correctly for detecting
Simplified Chinese and Traditional Chinese.
As a result, the user-configured codepage selector is no longer needed and
will be removed in a future commit along with some other enhancements.
LouYihua 7 years ago
parent
commit
27d5a1e3c2
6 changed files with 150 additions and 26 deletions
  1. 93 1
      global.c
  2. 5 0
      global.h
  3. 2 2
      palcommon.h
  4. 4 4
      script.c
  5. 36 19
      text.c
  6. 10 0
      text.h

+ 93 - 1
global.c

@@ -109,6 +109,93 @@ PAL_IsWINVersion_Exit:
 	return result;
 }
 
+CODEPAGE
+PAL_DetectCodePage(
+	void
+)
+/*++
+  Purpose:
+
+    Guess whether the game's word.dat is encoded in BIG5 or GBK.
+
+  Parameters:
+
+    None.
+
+  Return value:
+
+    The code page whose conversion of word.dat produces the fewest characters
+    outside the expected ranges. CP_BIG5 is returned when word.dat cannot be
+    opened, read or converted.
+
+--*/
+{
+	// Try to convert the content of word.dat with different codepages,
+	// and use the codepage with minimal inconvertible characters
+	// Works fine currently for detecting Simplified Chinese & Traditional Chinese.
+	// Since we're using language files to support additional languages, this detection
+	// should be fine for us now.
+
+	FILE *fp;
+	char *word_buf = NULL;
+	long word_len = 0;
+	CODEPAGE cp = CP_BIG5;	// Defaults to BIG5
+
+	if (NULL != (fp = UTIL_OpenFile("word.dat")))
+	{
+		// ftell() reports failure as -1, and an empty file has nothing to analyze,
+		// so only allocate when a positive size was obtained.
+		if (fseek(fp, 0, SEEK_END) == 0 && (word_len = ftell(fp)) > 0 &&
+			fseek(fp, 0, SEEK_SET) == 0)
+		{
+			word_buf = (char *)malloc((size_t)word_len);
+		}
+		if (word_buf)
+		{
+			// Only trust the bytes that were actually read
+			word_len = (long)fread(word_buf, 1, (size_t)word_len, fp);
+			if (word_len <= 0)
+			{
+				free(word_buf);
+				word_buf = NULL;
+			}
+		}
+		UTIL_CloseFile(fp);
+	}
+
+	if (!word_buf)
+	{
+		// Nothing to analyze: keep the default
+		return cp;
+	}
+
+	// Eliminates null characters so that PAL_MultiByteToWideCharCP works properly
+	for (char *ptr = word_buf; ptr < word_buf + word_len; ptr++)
+	{
+		if (!*ptr)
+			*ptr = ' ';
+	}
+
+	// The WORD.DAT should not contain characters outside these ranges
+	static const int valid_ranges[][2] = {
+		{ 0x4E00, 0x9FFF }, // CJK Unified Ideographs
+		{ 0x3400, 0x4DBF }, // CJK Unified Ideographs Extension A
+		{ 0xF900, 0xFAFF }, // CJK Compatibility Ideographs
+		{ 0x0030, 0x0039 }, // 0 - 9, ASCII
+		{ 0xFF10, 0xFF19 }, // 0 - 9, full wide
+		{ 0x0041, 0x005A }, // A - Z, ASCII
+		{ 0xFF21, 0xFF3A }, // A - Z, full wide
+		{ 0x0061, 0x007A }, // a - z, ASCII
+		{ 0xFF41, 0xFF5A }, // a - z, full wide
+	};
+	const int range_count = (int)(sizeof(valid_ranges) / sizeof(valid_ranges[0]));
+	int min_invalids = INT_MAX;
+
+	for (CODEPAGE i = CP_BIG5; i <= CP_GBK; i++)
+	{
+		int invalids = 0;
+		int length = PAL_MultiByteToWideCharCP(i, word_buf, word_len, NULL, 0);
+		WCHAR *wbuf;
+
+		// Defensive: a negative length must never reach malloc()
+		if (length < 0)
+			continue;
+		// Request at least one element so a NULL result always means failure
+		wbuf = (WCHAR *)malloc((length > 0 ? (size_t)length : 1) * sizeof(WCHAR));
+		if (!wbuf)
+			continue;
+
+		PAL_MultiByteToWideCharCP(i, word_buf, word_len, wbuf, length);
+		for (int j = 0; j < length; j++)
+		{
+			int score = 1;
+			if (iswspace(wbuf[j])) continue;
+			for (int k = 0; k < range_count; k++)
+			{
+				if (wbuf[j] >= valid_ranges[k][0] &&
+					wbuf[j] <= valid_ranges[k][1])
+				{
+					score = 0;
+					break;
+				}
+			}
+			invalids += score;
+		}
+		// code page with fewer invalid chars wins (ties keep the earlier one)
+		if (invalids < min_invalids)
+		{
+			min_invalids = invalids;
+			cp = i;
+		}
+		free(wbuf);
+	}
+	free(word_buf);
+
+	if (min_invalids == INT_MAX)
+		UTIL_LogOutput(LOGLEVEL_WARNING, "%s could not evaluate any code page, falling back to %s\n", __func__, "BIG5");
+	else if (min_invalids == 0)
+		UTIL_LogOutput(LOGLEVEL_INFO, "%s detected code page: %s\n", __func__, cp == CP_GBK ? "GBK" : "BIG5");
+	else
+		UTIL_LogOutput(LOGLEVEL_WARNING, "%s detected possible code page: %s [%d invalids]\n", __func__, cp == CP_GBK ? "GBK" : "BIG5", min_invalids);
+
+	return cp;
+}
+
 INT
 PAL_InitGlobals(
    VOID
@@ -141,10 +228,15 @@ PAL_InitGlobals(
    gpGlobals->f.fpSSS = UTIL_OpenRequiredFile("sss.mkf");
 
    //
-   // Retrieve game resource version & language
+   // Retrieve game resource version
    //
    if (!PAL_IsWINVersion(&gConfig.fIsWIN95)) return -1;
 
+   //
+   // Detect game language only when no message file specified
+   //
+   if (!gConfig.pszMsgFile) PAL_SetCodePage(PAL_DetectCodePage());
+
    //
    // Set decompress function
    //

+ 5 - 0
global.h

@@ -554,6 +554,11 @@ PAL_IsWINVersion(
    BOOL *pfIsWIN95
 );
 
+CODEPAGE
+PAL_DetectCodePage(	// Chooses CP_BIG5 or CP_GBK by test-converting word.dat; CP_BIG5 if the file cannot be opened
+	void
+);
+
 INT
 PAL_InitGlobals(
    VOID

+ 2 - 2
palcommon.h

@@ -117,8 +117,8 @@ typedef enum tagCODEPAGE {
 	CP_MIN = 0,
 	CP_BIG5 = 0,
 	CP_GBK = 1,
-	CP_SHIFTJIS = 2,
-	CP_JISX0208 = 3,
+	//CP_SHIFTJIS = 2,
+	//CP_JISX0208 = 3,
 	CP_MAX = CP_GBK + 1,
 	CP_UTF_8 = CP_MAX + 1
 } CODEPAGE;

+ 4 - 4
script.c

@@ -561,7 +561,7 @@ PAL_AdditionalCredits(
    for (i = 0; i < 12; i++)
    {
       WCHAR buffer[48];
-      PAL_swprintf(buffer, sizeof(buffer) / sizeof(WCHAR), rgszStrings[i], gConfig.pszMsgFile ? g_rcCredits[i] : rgszcps[i][gConfig.uCodePage]);
+      PAL_swprintf(buffer, sizeof(buffer) / sizeof(WCHAR), rgszStrings[i], gConfig.pszMsgFile ? g_rcCredits[i] : rgszcps[i][PAL_GetCodePage()]);
       PAL_DrawText(buffer, PAL_XY(0, 2 + i * 16), DESCTEXT_COLOR, TRUE, FALSE, FALSE);
    }
 
@@ -3044,9 +3044,9 @@ PAL_RunTriggerScript(
    {
       pScript = &(gpGlobals->g.lprgScriptEntry[wScriptEntry]);
 
-      UTIL_LogOutput(LOGLEVEL_DEBUG, "[SCRIPT] %.4x: %.4x %.4x %.4x %.4x\n", wScriptEntry,
-         pScript->wOperation, pScript->rgwOperand[0], pScript->rgwOperand[1],
-         pScript->rgwOperand[2], pScript->rgwOperand[3]);
+      UTIL_LogOutput(LOGLEVEL_DEBUG, "[SCRIPT] %.4x: %.4x %.4x %.4x\n", wScriptEntry,
+         pScript->wOperation, pScript->rgwOperand[0],
+         pScript->rgwOperand[1], pScript->rgwOperand[2]);
 
       switch (pScript->wOperation)
       {

+ 36 - 19
text.c

@@ -698,10 +698,10 @@ PAL_InitText(
 
 	   g_TextLib.lpIndexBuf = NULL;
 
-	   memcpy(g_TextLib.lpWordBuf + SYSMENU_LABEL_LAUNCHSETTING, gc_rgszSDLPalWords[gConfig.uCodePage], SDLPAL_EXTRA_WORD_COUNT * sizeof(LPCWSTR));
+	   memcpy(g_TextLib.lpWordBuf + SYSMENU_LABEL_LAUNCHSETTING, gc_rgszSDLPalWords[PAL_GetCodePage()], SDLPAL_EXTRA_WORD_COUNT * sizeof(LPCWSTR));
 
 #ifndef PAL_CLASSIC
-	   memcpy(g_TextLib.lpWordBuf + SYSMENU_LABEL_BATTLEMODE, gc_rgszAdditionalWords[gConfig.uCodePage], ATB_WORD_COUNT * sizeof(LPCWSTR));
+	   memcpy(g_TextLib.lpWordBuf + SYSMENU_LABEL_BATTLEMODE, gc_rgszAdditionalWords[PAL_GetCodePage()], ATB_WORD_COUNT * sizeof(LPCWSTR));
 #endif
    }
 
@@ -1512,6 +1512,39 @@ PAL_DialogIsPlayingRNG(
    return g_TextLib.fPlayingRNG;
 }
 
+WCHAR
+PAL_GetInvalidChar(
+	CODEPAGE      uCodePage
+)
+/*++
+  Purpose:
+
+    Look up the "invalid character" value associated with a code page.
+
+  Parameters:
+
+    [IN]  uCodePage - the code page to query.
+
+  Return value:
+
+    0x3f ('?') for CP_BIG5, CP_GBK and CP_UTF_8; 0 for any other value.
+
+--*/
+{
+	WCHAR wch = 0;
+
+	switch (uCodePage)
+	{
+	case CP_BIG5:
+	case CP_GBK:
+	case CP_UTF_8:
+		// All currently supported code pages share the ASCII question mark.
+		// (CP_SHIFTJIS, when it was supported, used 0x30fb instead.)
+		wch = 0x3f;
+		break;
+	default:
+		break;
+	}
+
+	return wch;
+}
+
+static CODEPAGE g_codepage = CP_UTF_8;	// code page returned by PAL_GetCodePage(); stays CP_UTF_8 until PAL_SetCodePage() overwrites it
+
+CODEPAGE
+PAL_GetCodePage(	// Returns the code page currently stored in the file-scope g_codepage
+	void
+)
+{
+	return g_codepage;	// initial value is CP_UTF_8; changed only through PAL_SetCodePage()
+}
+
+void
+PAL_SetCodePage(	// Stores uCodePage in the file-scope g_codepage
+	CODEPAGE    uCodePage	// new code page; stored as-is, no validation is performed here
+)
+{
+	g_codepage = uCodePage;	// subsequently read by PAL_GetCodePage() and PAL_MultiByteToWideChar()
+}
+
 INT
 PAL_MultiByteToWideCharCP(
    CODEPAGE      cp,
@@ -1797,7 +1830,7 @@ PAL_MultiByteToWideChar(
 
 --*/
 {
-	return PAL_MultiByteToWideCharCP(gConfig.uCodePage, mbs, mbslength, wcs, wcslength);
+	return PAL_MultiByteToWideCharCP(g_codepage, mbs, mbslength, wcs, wcslength);
 }
 
 INT
@@ -2108,19 +2141,3 @@ PAL_swprintf(
 	va_end(ap);
 	return count;
 }
-
-
-WCHAR
-PAL_GetInvalidChar(
-   CODEPAGE      uCodePage
-)
-{
-   switch(uCodePage)
-   {
-   case CP_BIG5:     return 0x3f;
-   case CP_GBK:      return 0x3f;
-   //case CP_SHIFTJIS: return 0x30fb;
-   case CP_UTF_8:    return 0x3f;
-   default:          return 0;
-   }
-}

+ 10 - 0
text.h

@@ -133,6 +133,16 @@ PAL_GetInvalidChar(
    CODEPAGE      uCodePage
 );
 
+CODEPAGE
+PAL_GetCodePage(	// Returns the code page last stored by PAL_SetCodePage() (CP_UTF_8 if never set)
+	void
+);
+
+void
+PAL_SetCodePage(	// Stores the code page later returned by PAL_GetCodePage()
+	CODEPAGE    uCodePage	// code page to store
+);
+
 INT
 PAL_swprintf(
 	LPWSTR buffer,