Hi,
I came across gendict.cxx while fixing a possible memory leak. It took me some
time to figure out what the code did.
I noticed a lot of very long function bodies in the LO code, and gendict was no
exception. So I refactored the code, did some Google searches on gendict, and
was able to fix the memory leak.
I ended up submitting only the fix, not the refactoring, because I didn't
want to break any de facto coding style guidelines and I am still a fairly
new contributor to LO.
I am submitting these patches now so you can decide whether to push them or not,
in case they could save some other new contributor some time in understanding
gendict ;)
BTW, I tested the code on ja.dic and the output is still the same.
-- Kenneth
From 7155a3675cf3410ab74dc7032bceaef719548d3c Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Thu, 27 Jan 2011 22:27:24 +0100
Subject: [PATCH 1/8] added some documentation to gendict
---
i18npool/source/breakiterator/gendict.cxx | 17 ++++++++++++++++-
1 files changed, 16 insertions(+), 1 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 9f49f67..8a6354b 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -41,7 +41,22 @@ using std::vector;
using namespace ::rtl;
-/* Main Procedure */
+/* Utility gendict:
+
+ "BreakIterator_CJK provides input string caching and dictionary searching for
+ longest matching. You can provide a sorted dictionary (the encoding must be
+ UTF-8) by creating the following file:
+ i18npool/source/breakiterator/data/<language>.dict.
+
+ The utility gendict will convert the file to C code, which will be compiled
+ into a shared library for dynamic loading.
+
+ All dictionary searching and loading is performed in the xdictionary class.
+ The only thing you need to do is to derive your class from BreakIterator_CJK
+ and create an instance of the xdictionary with the language name and
+ pass it to the parent class." (from http://wiki.services.openoffice.org/wiki/
+ /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
+*/
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
{
--
1.7.1
From d44b67f81303f56246f93a6eb6419f371d5b215f Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Thu, 27 Jan 2011 22:43:49 +0100
Subject: [PATCH 2/8] refactored out some simple print functions
---
i18npool/source/breakiterator/gendict.cxx | 35 +++++++++++++++++++----------
1 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 8a6354b..df7e144 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -58,6 +58,9 @@ using namespace ::rtl;
/Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
*/
+void printIncludes(FILE *source_fp);
+void printFunctions(FILE *source_fp);
+
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
{
FILE *sfp, *cfp;
@@ -79,12 +82,7 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
return -1;
}
- fprintf(cfp, "/*\n");
- fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
- fprintf(cfp, " * All Rights Reserved.\n");
- fprintf(cfp, " */\n\n");
- fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!!
*/\n\n");
- fprintf(cfp, "#include <sal/types.h>\n\n");
+ printIncludes(cfp);
fprintf(cfp, "extern \"C\" {\n");
sal_Int32 count, i, j;
@@ -209,12 +207,6 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
}
fprintf (cfp, "\n};\n");
- // create function to return arrays
- fprintf (cfp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n");
- fprintf (cfp, "\tconst sal_Int16* getIndex1() { return index1; }\n");
- fprintf (cfp, "\tconst sal_Int32* getIndex2() { return index2; }\n");
- fprintf (cfp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n");
- fprintf (cfp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
fprintf (cfp, "}\n");
fclose(sfp);
@@ -223,4 +215,23 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
return 0;
} // End of main
+void printIncludes(FILE* source_fp)
+{
+ fprintf(source_fp, "/*\n");
+ fprintf(source_fp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
+ fprintf(source_fp, " * All Rights Reserved.\n");
+ fprintf(source_fp, " */\n\n");
+ fprintf(source_fp, "/* !!!The file is generated automatically. DO NOT edit the file
manually!!! */\n\n");
+ fprintf(source_fp, "#include <sal/types.h>\n\n");
+}
+
+void printFunctions(FILE* source_fp)
+{
+ fprintf (source_fp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n");
+ fprintf (source_fp, "\tconst sal_Int16* getIndex1() { return index1; }\n");
+ fprintf (source_fp, "\tconst sal_Int32* getIndex2() { return index2; }\n");
+ fprintf (source_fp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n");
+ fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
+}
+
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
--
1.7.1
From f7c0500f2f74b785b27d185eb533e49de6753160 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Thu, 27 Jan 2011 23:02:11 +0100
Subject: [PATCH 3/8] refactored out dataArea
---
i18npool/source/breakiterator/gendict.cxx | 104 +++++++++++++++++------------
1 files changed, 60 insertions(+), 44 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index df7e144..eb654f5 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -59,6 +59,10 @@ using namespace ::rtl;
*/
void printIncludes(FILE *source_fp);
+void initArrays(sal_Bool *exists, sal_Int32 *charArray);
+void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+ sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+ vector<sal_Int32>& lenArray, sal_Bool *exists);
void printFunctions(FILE *source_fp);
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
@@ -89,51 +93,9 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
sal_Int32 lenArrayCurr = 0, charArray[0x10000];
vector<sal_Int32> lenArray;
sal_Bool exist[0x10000];
- for (i = 0; i < 0x10000; i++) {
- exist[i] = sal_False;
- charArray[i] = 0;
- }
-
- // generate main dict. data array
- fprintf(cfp, "static const sal_Unicode dataArea[] = {");
- sal_Char str[1024];
- sal_Unicode current = 0;
- count = 0;
- while (fgets(str, 1024, sfp)) {
- // input file is in UTF-8 encoding
- // don't convert last new line character to Ostr.
- OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
- const sal_Unicode *u = Ostr.getStr();
-
- sal_Int32 len = Ostr.getLength();
-
- i=0;
- Ostr.iterateCodePoints(&i, 1);
- if (len == i) continue; // skip one character word
-
- if (*u != current) {
- if (*u < current)
- printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current,
- sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size()));
- current = *u;
- charArray[current] = lenArray.size();
- }
+ initArrays( exist, charArray );
- lenArray.push_back(lenArrayCurr);
-
- exist[u[0]] = sal_True;
- for (i = 1; i < len; i++) { // start from second character,
- exist[u[i]] = sal_True; // since the first character is captured in charArray.
- lenArrayCurr++;
- if ((count++) % 0x10 == 0)
- fprintf(cfp, "\n\t");
- fprintf(cfp, "0x%04x, ", u[i]);
- }
- }
- lenArray.push_back( lenArrayCurr ); // store last ending pointer
-
- charArray[current+1] = lenArray.size();
- fprintf(cfp, "\n};\n");
+ printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
// generate lenArray
fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t");
@@ -215,6 +177,14 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
return 0;
} // End of main
+void initArrays(sal_Bool* exists, sal_Int32* charArray)
+{
+ for (sal_Int32 i = 0; i < 0x10000; i++) {
+ exists[i] = sal_False;
+ charArray[i] = 0;
+ }
+}
+
void printIncludes(FILE* source_fp)
{
fprintf(source_fp, "/*\n");
@@ -234,4 +204,50 @@ void printFunctions(FILE* source_fp)
fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
}
+void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+ sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+ vector<sal_Int32>& lenArray, sal_Bool *exists)
+{
+ // generate main dict. data array
+ fprintf(cfp, "static const sal_Unicode dataArea[] = {");
+ sal_Char str[1024];
+ sal_Unicode current = 0;
+ count = 0;
+ while (fgets(str, 1024, sfp)) {
+ // input file is in UTF-8 encoding
+ // don't convert last new line character to Ostr.
+ OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
+ const sal_Unicode *u = Ostr.getStr();
+
+ sal_Int32 len = Ostr.getLength();
+
+ i=0;
+ Ostr.iterateCodePoints(&i, 1);
+ if (len == i) continue; // skip one character word
+
+ if (*u != current) {
+ if (*u < current)
+ printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current,
+ sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size()));
+ current = *u;
+ charArray[current] = lenArray.size();
+ }
+
+ lenArray.push_back(lenArrayCurr);
+
+ exists[u[0]] = sal_True;
+ for (i = 1; i < len; i++) { // start from second character,
+ exists[u[i]] = sal_True; // since the first character is captured in charArray.
+ lenArrayCurr++;
+ if ((count++) % 0x10 == 0)
+ fprintf(cfp, "\n\t");
+ fprintf(cfp, "0x%04x, ", u[i]);
+ }
+ }
+ lenArray.push_back( lenArrayCurr ); // store last ending pointer
+
+ charArray[current+1] = lenArray.size();
+ fprintf(cfp, "\n};\n");
+}
+
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
--
1.7.1
From 1e5a7d4cd5e13895ce4db4f0fea85f8e77ffe876 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Thu, 27 Jan 2011 23:43:38 +0100
Subject: [PATCH 4/8] refactored out all array functions
---
i18npool/source/breakiterator/gendict.cxx | 188 ++++++++++++++++-------------
1 files changed, 104 insertions(+), 84 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index eb654f5..315acfa 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -60,9 +60,15 @@ using namespace ::rtl;
void printIncludes(FILE *source_fp);
void initArrays(sal_Bool *exists, sal_Int32 *charArray);
-void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
sal_Int32 lenArrayCurr, sal_Int32 *charArray,
vector<sal_Int32>& lenArray, sal_Bool *exists);
+void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray,
+ sal_Int32 count);
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
+ sal_Int16 *set);
+void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
+void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count);
void printFunctions(FILE *source_fp);
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
@@ -86,89 +92,21 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
return -1;
}
- printIncludes(cfp);
- fprintf(cfp, "extern \"C\" {\n");
-
- sal_Int32 count, i, j;
+ sal_Int32 count, i;
sal_Int32 lenArrayCurr = 0, charArray[0x10000];
vector<sal_Int32> lenArray;
sal_Bool exist[0x10000];
- initArrays( exist, charArray );
-
- printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
-
- // generate lenArray
- fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t");
- count = 1;
- fprintf(cfp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
- for (size_t k = 0; k < lenArray.size(); k++)
- {
- fprintf(cfp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
- if (count == 0xf)
- {
- count = 0;
- fprintf(cfp, "\n\t");
- }
- else count++;
- }
- fprintf(cfp, "\n};\n");
-
- // generate index1 array
- fprintf (cfp, "static const sal_Int16 index1[] = {\n\t");
sal_Int16 set[0x100];
- count = 0;
- for (i = 0; i < 0x100; i++) {
- for (j = 0; j < 0x100; j++)
- if (charArray[(i*0x100) + j] != 0)
- break;
-
- fprintf(cfp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) :
0xff));
- if ((i+1) % 0x10 == 0)
- fprintf (cfp, "\n\t");
- }
- fprintf (cfp, "};\n");
-
- // generate index2 array
- fprintf (cfp, "static const sal_Int32 index2[] = {\n\t");
- sal_Int32 prev = 0;
- for (i = 0; i < 0x100; i++) {
- if (set[i] != 0xff) {
- for (j = 0; j < 0x100; j++) {
- sal_Int32 k = (i*0x100) + j;
- if (prev != 0 && charArray[k] == 0) {
- for (k++; k < 0x10000; k++)
- if (charArray[k] != 0)
- break;
- }
- prev = charArray[(i*0x100) + j];
- fprintf(
- cfp, "0x%lx, ",
- sal::static_int_cast< unsigned long >(
- k < 0x10000 ? charArray[k] + 1 : 0));
- if ((j+1) % 0x10 == 0)
- fprintf (cfp, "\n\t");
- }
- fprintf (cfp, "\n\t");
- }
- }
- fprintf (cfp, "\n};\n");
-
- // generate existMark array
- count = 0;
- fprintf (cfp, "static const sal_uInt8 existMark[] = {\n\t");
- for (i = 0; i < 0x1FFF; i++) {
- sal_uInt8 bit = 0;
- for (j = 0; j < 8; j++)
- if (exist[i * 8 + j])
- bit |= 1 << j;
- fprintf(cfp, "0x%02x, ", bit);
- if (count == 0xf) {
- count = 0;
- fprintf(cfp, "\n\t");
- } else count++;
- }
- fprintf (cfp, "\n};\n");
+ initArrays( exist, charArray );
+ printIncludes(cfp);
+ fprintf(cfp, "extern \"C\" {\n");
+ printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
+ printLenArray(cfp, lenArray, count);
+ printIndex1(cfp, charArray, count, set);
+ printIndex2(cfp, charArray, set);
+ printExistMark(cfp, exist, count);
+ printFunctions(cfp);
fprintf (cfp, "}\n");
fclose(sfp);
@@ -204,12 +142,12 @@ void printFunctions(FILE* source_fp)
fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
}
-void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
+void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
sal_Int32 lenArrayCurr, sal_Int32 *charArray,
vector<sal_Int32>& lenArray, sal_Bool *exists)
{
// generate main dict. data array
- fprintf(cfp, "static const sal_Unicode dataArea[] = {");
+ fprintf(source_fp, "static const sal_Unicode dataArea[] = {");
sal_Char str[1024];
sal_Unicode current = 0;
count = 0;
@@ -240,14 +178,96 @@ void printDataArea(FILE *sfp, FILE *cfp, sal_Int32 count, sal_Int32 i,
exists[u[i]] = sal_True; // since the first character is captured in charArray.
lenArrayCurr++;
if ((count++) % 0x10 == 0)
- fprintf(cfp, "\n\t");
- fprintf(cfp, "0x%04x, ", u[i]);
+ fprintf(source_fp, "\n\t");
+ fprintf(source_fp, "0x%04x, ", u[i]);
}
}
lenArray.push_back( lenArrayCurr ); // store last ending pointer
charArray[current+1] = lenArray.size();
- fprintf(cfp, "\n};\n");
+ fprintf(source_fp, "\n};\n");
+}
+
+void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray,
+ sal_Int32 count)
+{
+ fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
+ count = 1;
+ fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
+ for (size_t k = 0; k < lenArray.size(); k++)
+ {
+ fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
+ if (count == 0xf)
+ {
+ count = 0;
+ fprintf(source_fp, "\n\t");
+ }
+ else count++;
+ }
+ fprintf(source_fp, "\n};\n");
+}
+
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
+ sal_Int16 *set)
+{
+ fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
+ count = 0;
+ sal_Int32 j;
+ for (sal_Int32 i = 0; i < 0x100; i++) {
+ for (j = 0; j < 0x100; j++)
+ if (charArray[(i*0x100) + j] != 0)
+ break;
+
+ fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ?
sal::static_int_cast<sal_Int16>(count++) : 0xff));
+ if ((i+1) % 0x10 == 0)
+ fprintf (source_fp, "\n\t");
+ }
+ fprintf (source_fp, "};\n");
+}
+
+void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
+{
+ fprintf (source_fp, "static const sal_Int32 index2[] = {\n\t");
+ sal_Int32 prev = 0;
+ for (sal_Int32 i = 0; i < 0x100; i++) {
+ if (set[i] != 0xff) {
+ for (sal_Int32 j = 0; j < 0x100; j++) {
+ sal_Int32 k = (i*0x100) + j;
+ if (prev != 0 && charArray[k] == 0) {
+ for (k++; k < 0x10000; k++)
+ if (charArray[k] != 0)
+ break;
+ }
+ prev = charArray[(i*0x100) + j];
+ fprintf(
+ source_fp, "0x%lx, ",
+ sal::static_int_cast< unsigned long >(
+ k < 0x10000 ? charArray[k] + 1 : 0));
+ if ((j+1) % 0x10 == 0)
+ fprintf (source_fp, "\n\t");
+ }
+ fprintf (source_fp, "\n\t");
+ }
+ }
+ fprintf (source_fp, "\n};\n");
+}
+
+void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count)
+{
+ count = 0;
+ fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
+ for (sal_Int32 i = 0; i < 0x1FFF; i++) {
+ sal_uInt8 bit = 0;
+ for (sal_Int32 j = 0; j < 8; j++)
+ if (exists[i * 8 + j])
+ bit |= 1 << j;
+ fprintf(source_fp, "0x%02x, ", bit);
+ if (count == 0xf) {
+ count = 0;
+ fprintf(source_fp, "\n\t");
+ } else count++;
+ }
+ fprintf (source_fp, "\n};\n");
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
--
1.7.1
From 8df201b3e266826392e6c59fb885d71cecbeeeb7 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Thu, 27 Jan 2011 23:52:52 +0100
Subject: [PATCH 5/8] reduced scope of some variables
---
i18npool/source/breakiterator/gendict.cxx | 43 +++++++++++++----------------
1 files changed, 19 insertions(+), 24 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 315acfa..93a359b 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -60,15 +60,12 @@ using namespace ::rtl;
void printIncludes(FILE *source_fp);
void initArrays(sal_Bool *exists, sal_Int32 *charArray);
-void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
- sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 *charArray,
vector<sal_Int32>& lenArray, sal_Bool *exists);
-void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray,
- sal_Int32 count);
-void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
- sal_Int16 *set);
+void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray);
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
-void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count);
+void printExistMark(FILE *source_fp, sal_Bool *exists);
void printFunctions(FILE *source_fp);
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
@@ -92,7 +89,6 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
return -1;
}
- sal_Int32 count, i;
sal_Int32 lenArrayCurr = 0, charArray[0x10000];
vector<sal_Int32> lenArray;
sal_Bool exist[0x10000];
@@ -101,11 +97,11 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
printIncludes(cfp);
fprintf(cfp, "extern \"C\" {\n");
- printDataArea(sfp, cfp, count, i, lenArrayCurr, charArray, lenArray, exist);
- printLenArray(cfp, lenArray, count);
- printIndex1(cfp, charArray, count, set);
+ printDataArea(sfp, cfp, charArray, lenArray, exist);
+ printLenArray(cfp, lenArray);
+ printIndex1(cfp, charArray, set);
printIndex2(cfp, charArray, set);
- printExistMark(cfp, exist, count);
+ printExistMark(cfp, exist);
printFunctions(cfp);
fprintf (cfp, "}\n");
@@ -142,15 +138,16 @@ void printFunctions(FILE* source_fp)
fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
}
-void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
- sal_Int32 lenArrayCurr, sal_Int32 *charArray,
+void printDataArea(FILE *sfp, FILE *source_fp,
+ sal_Int32 *charArray,
vector<sal_Int32>& lenArray, sal_Bool *exists)
{
// generate main dict. data array
fprintf(source_fp, "static const sal_Unicode dataArea[] = {");
sal_Char str[1024];
+ sal_Int32 lenArrayCurr = 0;
sal_Unicode current = 0;
- count = 0;
+ sal_Int32 count = 0;
while (fgets(str, 1024, sfp)) {
// input file is in UTF-8 encoding
// don't convert last new line character to Ostr.
@@ -159,7 +156,7 @@ void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
sal_Int32 len = Ostr.getLength();
- i=0;
+ sal_Int32 i=0;
Ostr.iterateCodePoints(&i, 1);
if (len == i) continue; // skip one character word
@@ -188,11 +185,10 @@ void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 count, sal_Int32 i,
fprintf(source_fp, "\n};\n");
}
-void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray,
- sal_Int32 count)
+void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray)
{
fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
- count = 1;
+ sal_Int32 count = 1;
fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
for (size_t k = 0; k < lenArray.size(); k++)
{
@@ -207,11 +203,10 @@ void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray,
fprintf(source_fp, "\n};\n");
}
-void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int32 count,
- sal_Int16 *set)
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
{
fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
- count = 0;
+ sal_Int32 count = 0;
sal_Int32 j;
for (sal_Int32 i = 0; i < 0x100; i++) {
for (j = 0; j < 0x100; j++)
@@ -252,9 +247,9 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
fprintf (source_fp, "\n};\n");
}
-void printExistMark(FILE *source_fp, sal_Bool *exists, sal_Int32 count)
+void printExistMark(FILE *source_fp, sal_Bool *exists)
{
- count = 0;
+ sal_Int32 count = 0;
fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
for (sal_Int32 i = 0; i < 0x1FFF; i++) {
sal_uInt8 bit = 0;
--
1.7.1
From 905f8fecf95f73d5d20f27ac72ff07bc4c7ebb5c Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Fri, 28 Jan 2011 00:14:53 +0100
Subject: [PATCH 6/8] readability changes
---
i18npool/source/breakiterator/gendict.cxx | 106 +++++++++++++++--------------
1 files changed, 55 insertions(+), 51 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 93a359b..3d0b627 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -41,6 +41,16 @@ using std::vector;
using namespace ::rtl;
+void printIncludes(FILE *source_fp);
+void initArrays(sal_Bool *exists, sal_Int32 *charArray);
+void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 *charArray,
+ vector<sal_Int32>& lenArray, sal_Bool *exists);
+void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray);
+void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
+void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
+void printExistMark(FILE *source_fp, sal_Bool *exists);
+void printFunctions(FILE *source_fp);
+
/* Utility gendict:
"BreakIterator_CJK provides input string caching and dictionary searching for
@@ -58,58 +68,52 @@ using namespace ::rtl;
/Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
*/
-void printIncludes(FILE *source_fp);
-void initArrays(sal_Bool *exists, sal_Int32 *charArray);
-void printDataArea(FILE *sfp, FILE *source_fp, sal_Int32 *charArray,
- vector<sal_Int32>& lenArray, sal_Bool *exists);
-void printLenArray(FILE *source_fp, const vector<sal_Int32>& lenArray);
-void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
-void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set);
-void printExistMark(FILE *source_fp, sal_Bool *exists);
-void printFunctions(FILE *source_fp);
-
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
{
- FILE *sfp, *cfp;
+ FILE *dictionary_fp, *source_fp;
- if (argc < 3) exit(-1);
+ if (argc < 3)
+ {
+ printf("2 arguments required: dictionary_file_name source_file_name");
+ exit(-1);
+ }
- sfp = fopen(argv[1], "rb"); // open the source file for read;
- if (sfp == NULL)
+ dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
+ if (dictionary_fp == NULL)
{
printf("Open the dictionary source file failed.");
return -1;
}
// create the C source file to write
- cfp = fopen(argv[2], "wb");
- if (cfp == NULL) {
- fclose(sfp);
+ source_fp = fopen(argv[2], "wb");
+ if (source_fp == NULL) {
+ fclose(dictionary_fp);
printf("Can't create the C source file.");
return -1;
}
- sal_Int32 lenArrayCurr = 0, charArray[0x10000];
+ sal_Int32 charArray[0x10000];
vector<sal_Int32> lenArray;
sal_Bool exist[0x10000];
sal_Int16 set[0x100];
initArrays( exist, charArray );
- printIncludes(cfp);
- fprintf(cfp, "extern \"C\" {\n");
- printDataArea(sfp, cfp, charArray, lenArray, exist);
- printLenArray(cfp, lenArray);
- printIndex1(cfp, charArray, set);
- printIndex2(cfp, charArray, set);
- printExistMark(cfp, exist);
- printFunctions(cfp);
- fprintf (cfp, "}\n");
+ printIncludes(source_fp);
+ fprintf(source_fp, "extern \"C\" {\n");
+ printDataArea(dictionary_fp, source_fp, charArray, lenArray, exist);
+ printLenArray(source_fp, lenArray);
+ printIndex1(source_fp, charArray, set);
+ printIndex2(source_fp, charArray, set);
+ printExistMark(source_fp, exist);
+ printFunctions(source_fp);
+ fprintf (source_fp, "}\n");
- fclose(sfp);
- fclose(cfp);
+ fclose(dictionary_fp);
+ fclose(source_fp);
return 0;
-} // End of main
+}
void initArrays(sal_Bool* exists, sal_Int32* charArray)
{
@@ -138,7 +142,7 @@ void printFunctions(FILE* source_fp)
fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
}
-void printDataArea(FILE *sfp, FILE *source_fp,
+void printDataArea(FILE *dictionary_fp, FILE *source_fp,
sal_Int32 *charArray,
vector<sal_Int32>& lenArray, sal_Bool *exists)
{
@@ -148,7 +152,7 @@ void printDataArea(FILE *sfp, FILE *source_fp,
sal_Int32 lenArrayCurr = 0;
sal_Unicode current = 0;
sal_Int32 count = 0;
- while (fgets(str, 1024, sfp)) {
+ while (fgets(str, 1024, dictionary_fp)) {
// input file is in UTF-8 encoding
// don't convert last new line character to Ostr.
OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
@@ -160,23 +164,23 @@ void printDataArea(FILE *sfp, FILE *source_fp,
Ostr.iterateCodePoints(&i, 1);
if (len == i) continue; // skip one character word
- if (*u != current) {
- if (*u < current)
- printf("u %x, current %x, count %d, lenArray.size() %d\n", *u, current,
- sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArray.size()));
- current = *u;
- charArray[current] = lenArray.size();
+ if (u[0] != current) {
+ if (u[0] < current)
+ printf("u %x, current %x, count %d, lenArray.size() %d\n", u[0], current,
+ sal::static_int_cast<int>(count),
sal::static_int_cast<int>(lenArray.size()));
+ current = u[0];
+ charArray[current] = lenArray.size();
}
lenArray.push_back(lenArrayCurr);
exists[u[0]] = sal_True;
for (i = 1; i < len; i++) { // start from second character,
- exists[u[i]] = sal_True; // since the first character is captured in charArray.
- lenArrayCurr++;
- if ((count++) % 0x10 == 0)
- fprintf(source_fp, "\n\t");
- fprintf(source_fp, "0x%04x, ", u[i]);
+ exists[u[i]] = sal_True; // since the first character is captured in charArray.
+ lenArrayCurr++;
+ if ((count++) % 0x10 == 0)
+ fprintf(source_fp, "\n\t");
+ fprintf(source_fp, "0x%04x, ", u[i]);
}
}
lenArray.push_back( lenArrayCurr ); // store last ending pointer
@@ -210,12 +214,12 @@ void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
sal_Int32 j;
for (sal_Int32 i = 0; i < 0x100; i++) {
for (j = 0; j < 0x100; j++)
- if (charArray[(i*0x100) + j] != 0)
- break;
+ if (charArray[(i*0x100) + j] != 0)
+ break;
fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ?
sal::static_int_cast<sal_Int16>(count++) : 0xff));
if ((i+1) % 0x10 == 0)
- fprintf (source_fp, "\n\t");
+ fprintf (source_fp, "\n\t");
}
fprintf (source_fp, "};\n");
}
@@ -231,7 +235,7 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
if (prev != 0 && charArray[k] == 0) {
for (k++; k < 0x10000; k++)
if (charArray[k] != 0)
- break;
+ break;
}
prev = charArray[(i*0x100) + j];
fprintf(
@@ -254,12 +258,12 @@ void printExistMark(FILE *source_fp, sal_Bool *exists)
for (sal_Int32 i = 0; i < 0x1FFF; i++) {
sal_uInt8 bit = 0;
for (sal_Int32 j = 0; j < 8; j++)
- if (exists[i * 8 + j])
- bit |= 1 << j;
+ if (exists[i * 8 + j])
+ bit |= 1 << j;
fprintf(source_fp, "0x%02x, ", bit);
if (count == 0xf) {
- count = 0;
- fprintf(source_fp, "\n\t");
+ count = 0;
+ fprintf(source_fp, "\n\t");
} else count++;
}
fprintf (source_fp, "\n};\n");
--
1.7.1
From 4d317d550bde15112ac2e312636146d02b2d3e03 Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Fri, 28 Jan 2011 00:29:14 +0100
Subject: [PATCH 7/8] changed some loop constructs
---
i18npool/source/breakiterator/gendict.cxx | 16 +++++++---------
1 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 3d0b627..90e6f75 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -211,11 +211,10 @@ void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
{
fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
sal_Int32 count = 0;
- sal_Int32 j;
for (sal_Int32 i = 0; i < 0x100; i++) {
- for (j = 0; j < 0x100; j++)
- if (charArray[(i*0x100) + j] != 0)
- break;
+ sal_Int32 j = 0;
+ while( j < 0x100 && charArray[(i*0x100) + j] == 0)
+ j++;
fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ?
sal::static_int_cast<sal_Int16>(count++) : 0xff));
if ((i+1) % 0x10 == 0)
@@ -232,11 +231,10 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
if (set[i] != 0xff) {
for (sal_Int32 j = 0; j < 0x100; j++) {
sal_Int32 k = (i*0x100) + j;
- if (prev != 0 && charArray[k] == 0) {
- for (k++; k < 0x10000; k++)
- if (charArray[k] != 0)
- break;
- }
+ if (prev != 0 )
+ while( charArray[k] == 0 && k < 0x10000 )
+ k++;
+
prev = charArray[(i*0x100) + j];
fprintf(
source_fp, "0x%lx, ",
--
1.7.1
From 8e9bff87e00f1324588323e3ff0a4e3779f6250f Mon Sep 17 00:00:00 2001
From: Kenneth Venken <kenneth.venken@gmail.com>
Date: Sun, 30 Jan 2011 00:00:38 +0100
Subject: [PATCH 8/8] more comments
---
i18npool/source/breakiterator/gendict.cxx | 53 ++++++++++++++++-------------
1 files changed, 29 insertions(+), 24 deletions(-)
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx
index 90e6f75..1b70f23 100644
--- a/i18npool/source/breakiterator/gendict.cxx
+++ b/i18npool/source/breakiterator/gendict.cxx
@@ -93,10 +93,10 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
return -1;
}
- sal_Int32 charArray[0x10000];
- vector<sal_Int32> lenArray;
- sal_Bool exist[0x10000];
+ vector<sal_Int32> lenArray; // stores the word boundaries in DataArea
sal_Int16 set[0x100];
+ sal_Bool exist[0x10000]; // true if unicode character exists
+ sal_Int32 charArray[0x10000]; // keeps track where words beginning with a certain char are
stored in DataArea
initArrays( exist, charArray );
printIncludes(source_fp);
@@ -142,8 +142,7 @@ void printFunctions(FILE* source_fp)
fprintf (source_fp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n");
}
-void printDataArea(FILE *dictionary_fp, FILE *source_fp,
- sal_Int32 *charArray,
+void printDataArea(FILE *dictionary_fp, FILE *source_fp, sal_Int32 *charArray,
vector<sal_Int32>& lenArray, sal_Bool *exists)
{
// generate main dict. data array
@@ -162,7 +161,8 @@ void printDataArea(FILE *dictionary_fp, FILE *source_fp,
sal_Int32 i=0;
Ostr.iterateCodePoints(&i, 1);
- if (len == i) continue; // skip one character word
+ if (len == i)
+ continue; // skip one character word
if (u[0] != current) {
if (u[0] < current)
@@ -184,7 +184,6 @@ void printDataArea(FILE *dictionary_fp, FILE *source_fp,
}
}
lenArray.push_back( lenArrayCurr ); // store last ending pointer
-
charArray[current+1] = lenArray.size();
fprintf(source_fp, "\n};\n");
}
@@ -207,6 +206,9 @@ void printLenArray(FILE* source_fp, const vector<sal_Int32>& lenArray)
fprintf(source_fp, "\n};\n");
}
+/* FIXME?: what happens if in every range i there is at least one charArray != 0
+ => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
+ => then in index2, the last range will be ignored incorrectly */
void printIndex1(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
{
fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
@@ -229,26 +231,28 @@ void printIndex2(FILE *source_fp, sal_Int32 *charArray, sal_Int16 *set)
sal_Int32 prev = 0;
for (sal_Int32 i = 0; i < 0x100; i++) {
if (set[i] != 0xff) {
- for (sal_Int32 j = 0; j < 0x100; j++) {
- sal_Int32 k = (i*0x100) + j;
- if (prev != 0 )
- while( charArray[k] == 0 && k < 0x10000 )
- k++;
-
- prev = charArray[(i*0x100) + j];
- fprintf(
- source_fp, "0x%lx, ",
- sal::static_int_cast< unsigned long >(
- k < 0x10000 ? charArray[k] + 1 : 0));
- if ((j+1) % 0x10 == 0)
+ for (sal_Int32 j = 0; j < 0x100; j++) {
+ sal_Int32 k = (i*0x100) + j;
+ if (prev != 0 )
+ while( charArray[k] == 0 && k < 0x10000 )
+ k++;
+
+ prev = charArray[(i*0x100) + j];
+ fprintf(
+ source_fp, "0x%lx, ",
+ sal::static_int_cast< unsigned long >(
+ k < 0x10000 ? charArray[k] + 1 : 0));
+ if ((j+1) % 0x10 == 0)
+ fprintf (source_fp, "\n\t");
+ }
fprintf (source_fp, "\n\t");
}
- fprintf (source_fp, "\n\t");
- }
}
fprintf (source_fp, "\n};\n");
}
+/* Generates a bitmask for the existence of sal_Unicode values in dictionary;
+ it packs 8 sal_Bool values in 1 sal_uInt8 */
void printExistMark(FILE *source_fp, sal_Bool *exists)
{
sal_Int32 count = 0;
@@ -256,13 +260,14 @@ void printExistMark(FILE *source_fp, sal_Bool *exists)
for (sal_Int32 i = 0; i < 0x1FFF; i++) {
sal_uInt8 bit = 0;
for (sal_Int32 j = 0; j < 8; j++)
- if (exists[i * 8 + j])
- bit |= 1 << j;
+ bit |= (exists[i * 8 + j]) << j;
+
fprintf(source_fp, "0x%02x, ", bit);
if (count == 0xf) {
count = 0;
fprintf(source_fp, "\n\t");
- } else count++;
+ } else
+ count++;
}
fprintf (source_fp, "\n};\n");
}
--
1.7.1
Context
- [Libreoffice] [PATCH] refactoring gendict · Kenneth Venken
Privacy Policy |
Impressum (Legal Info) |
Copyright information: Unless otherwise specified, all text and images
on this website are licensed under the
Creative Commons Attribution-Share Alike 3.0 License.
This does not include the source code of LibreOffice, which is
licensed under the Mozilla Public License (
MPLv2).
"LibreOffice" and "The Document Foundation" are
registered trademarks of their corresponding registered owners or are
in actual use as trademarks in one or more countries. Their respective
logos and icons are also subject to international copyright laws. Use
thereof is explained in our
trademark policy.