Browse Source

Enhance codepage detection and detect the codepage of the embedded font

LouYihua 7 years ago
parent
commit
be2bf2566c
5 changed files with 96 additions and 56 deletions
  1. 11 0
      font.c
  2. 10 55
      global.c
  3. 1 1
      global.h
  4. 66 0
      text.c
  5. 8 0
      text.h

+ 11 - 0
font.c

@@ -83,6 +83,17 @@ static void PAL_LoadEmbeddedFont(void)
 	//
 	fclose(fp);
 
+	//
+	// Detect the codepage of 'wor16.asc' and exit if not BIG5 or probability < 99
+	// Note: 100% probability is impossible as the function does not recognize some special
+	// characters such as bopomofo that may be used by 'wor16.asc'.
+	//
+	if (PAL_DetectCodePageForString(char_buf, nBytes, CP_BIG5, &i) != CP_BIG5 || i < 99)
+	{
+		free(char_buf);
+		return;
+	}
+
 	//
 	// Convert characters into unicode
 	// Explicitly specify BIG5 here for compatibility with codepage auto-detection

+ 10 - 55
global.c

@@ -111,21 +111,15 @@ PAL_IsWINVersion_Exit:
 
 CODEPAGE
 PAL_DetectCodePage(
-	void
+	const char *   filename
 )
 {
-	// Try to convert the content of word.dat with different codepages,
-	// and use the codepage with minimal inconvertible characters
-	// Works fine currently for detecting Simplified Chinese & Traditional Chinese.
-	// Since we're using language files to support additional languages, this detection
-	// should be fine for us now.
-
 	FILE *fp;
 	char *word_buf = NULL;
 	long word_len = 0;
-	CODEPAGE cp = CP_BIG5;	// Defaults to BIG5
+	CODEPAGE cp = CP_BIG5;
 
-	if (NULL != (fp = UTIL_OpenFile("word.dat")))
+	if (NULL != (fp = UTIL_OpenFile(filename)))
 	{
 		fseek(fp, 0, SEEK_END);
 		word_len = ftell(fp);
@@ -143,54 +137,15 @@ PAL_DetectCodePage(
 
 	if (word_buf)
 	{
-		// The WORD.DAT should not contain characters outside these ranges
-		const static int valid_ranges[][2] = {
-			{ 0x4E00, 0x9FFF }, // CJK Unified Ideographs
-			{ 0x3400, 0x4DBF }, // CJK Unified Ideographs Extension A
-			{ 0xF900, 0xFAFF }, // CJK Compatibility Ideographs
-			{ 0x0030, 0x0039 }, // 0 - 9, ASCII
-			{ 0xFF10, 0xFF19 }, // 0 - 9, full wide
-			{ 0x0041, 0x005A }, // A - Z, ASCII
-			{ 0xFF21, 0xFF3A }, // A - Z, full wide
-			{ 0x0061, 0x007A }, // a - z, ASCII
-			{ 0xFF41, 0xFF5A }, // a - z, full wide
-		};
-		int min_invalids = INT_MAX;
-
-		for (CODEPAGE i = CP_BIG5; i <= CP_GBK; i++)
-		{
-			int invalids, length = PAL_MultiByteToWideCharCP(i, word_buf, word_len, NULL, 0);
-			WCHAR *wbuf = (WCHAR *)malloc(length * sizeof(WCHAR));
-			PAL_MultiByteToWideCharCP(i, word_buf, word_len, wbuf, length);
-			for (int j = invalids = 0; j < length; j++)
-			{
-				int score = 1;
-				if (iswspace(wbuf[j]) != 0) continue;
-				for (int k = 0; k < sizeof(valid_ranges) / sizeof(valid_ranges[0]); k++)
-				{
-					if (wbuf[j] >= valid_ranges[k][0] &&
-						wbuf[j] <= valid_ranges[k][1])
-					{
-						score = 0;
-						break;
-					}
-				}
-				invalids += score;
-			}
-			// code page with less invalid chars wins
-			if (invalids < min_invalids)
-			{
-				min_invalids = invalids;
-				cp = i;
-			}
-			free(wbuf);
-		}
+		int probability;
+		cp = PAL_DetectCodePageForString(word_buf, (int)word_len, cp, &probability);
+
 		free(word_buf);
 
-		if (min_invalids == 0)
-			UTIL_LogOutput(LOGLEVEL_INFO, "PAL_DetectCodePage detected code page: %s\n", cp ? "GBK" : "BIG5");
+		if (probability == 100)
+			UTIL_LogOutput(LOGLEVEL_INFO, "PAL_DetectCodePage detected code page '%s' for %s\n", cp ? "GBK" : "BIG5", filename);
 		else
-			UTIL_LogOutput(LOGLEVEL_WARNING, "PAL_DetectCodePage detected possible code page: %s [%d invalids]\n", cp ? "GBK" : "BIG5", min_invalids);
+			UTIL_LogOutput(LOGLEVEL_WARNING, "PAL_DetectCodePage detected the most possible (%d) code page '%s' for %s\n", probability, cp ? "GBK" : "BIG5", filename);
 	}
 
 	return cp;
@@ -235,7 +190,7 @@ PAL_InitGlobals(
    //
    // Detect game language only when no message file specified
    //
-   if (!gConfig.pszMsgFile) PAL_SetCodePage(PAL_DetectCodePage());
+   if (!gConfig.pszMsgFile) PAL_SetCodePage(PAL_DetectCodePage("word.dat"));
 
    //
    // Set decompress function

+ 1 - 1
global.h

@@ -556,7 +556,7 @@ PAL_IsWINVersion(
 
 CODEPAGE
 PAL_DetectCodePage(
-	void
+	const char *   filename
 );
 
 INT

+ 66 - 0
text.c

@@ -1545,6 +1545,72 @@ PAL_SetCodePage(
 	g_codepage = uCodePage;
 }
 
+CODEPAGE
+PAL_DetectCodePageForString(
+	const char *   text,          // bytes to examine; NULL is tolerated
+	int            text_len,      // length of text in bytes
+	CODEPAGE       default_cp,    // returned unchanged when there is nothing to examine
+	int *          probability    // optional out: confidence score in the range 0..100
+)
+{
+	// Decode the given text with every codepage from CP_BIG5 through CP_GBK and
+	// return the one that yields the fewest characters outside valid_ranges.
+	// default_cp is only returned when text is NULL or text_len <= 0; in that case
+	// *probability is set to 0. Otherwise *probability is derived from the share of
+	// valid characters, apparently assuming two bytes per character (text_len / 2).
+	int min_invalids = INT_MAX;
+
+	if (text && text_len > 0)
+	{
+		// The text being examined is expected to contain only characters in these ranges
+		const static int valid_ranges[][2] = {
+			{ 0x4E00, 0x9FFF }, // CJK Unified Ideographs
+			{ 0x3400, 0x4DBF }, // CJK Unified Ideographs Extension A
+			{ 0xF900, 0xFAFF }, // CJK Compatibility Ideographs
+			{ 0x0020, 0x007E }, // Basic ASCII
+			{ 0x3000, 0x301E }, // CJK Symbols
+			{ 0xFF01, 0xFF5E }, // Fullwidth Forms
+		};
+
+		for (CODEPAGE i = CP_BIG5; i <= CP_GBK; i++)
+		{
+			int invalids, length = PAL_MultiByteToWideCharCP(i, text, text_len, NULL, 0); // NULL/0 call appears to report the required WCHAR count
+			WCHAR *wbuf = (WCHAR *)malloc(length * sizeof(WCHAR)); // NOTE(review): result is used below without a NULL check
+			PAL_MultiByteToWideCharCP(i, text, text_len, wbuf, length);
+			for (int j = invalids = 0; j < length; j++)
+			{
+				int score = 1; // counts as invalid unless one of the ranges matches
+				for (int k = 0; k < sizeof(valid_ranges) / sizeof(valid_ranges[0]); k++)
+				{
+					if (wbuf[j] >= valid_ranges[k][0] &&
+						wbuf[j] <= valid_ranges[k][1])
+					{
+						score = 0;
+						break;
+					}
+				}
+				invalids += score;
+			}
+			// The codepage with fewer invalid characters wins; on a tie the earlier one is kept
+			if (invalids < min_invalids)
+			{
+				min_invalids = invalids;
+				default_cp = i;
+			}
+			free(wbuf);
+		}
+	}
+	if (probability)
+	{
+		if (min_invalids < text_len / 2) // also false for empty input, where min_invalids stays INT_MAX
+			*probability = (text_len / 2 - min_invalids) * 200 / text_len; // NOTE(review): integer division; an odd text_len cannot reach 100
+		else
+			*probability = 0;
+	}
+
+	return default_cp;
+}
+
 INT
 PAL_MultiByteToWideCharCP(
    CODEPAGE      cp,

+ 8 - 0
text.h

@@ -143,6 +143,14 @@ PAL_SetCodePage(
 	CODEPAGE    uCodePage
 );
 
+CODEPAGE
+PAL_DetectCodePageForString(
+	const char *   text,          // bytes to examine; NULL is tolerated
+	int            text_len,      // length of text in bytes
+	CODEPAGE       default_cp,    // returned unchanged when there is nothing to examine
+	int *          probability    // optional out: confidence score in the range 0..100
+);
+
 INT
 PAL_swprintf(
 	LPWSTR buffer,