[REVIEW 3-5] fdo#49208 ridiculous performance on certain .doc

Caolán McNamara <caolanm -AT- redhat.com>
Tue, 01 May 2012 16:43:33 +0100

The .doc has some paragraphs in it that are around the 64k length mark.
It doesn't *look* like that's the case because they end in a vast quantity
of whitespace, and Writer implements the Word-compatibility behaviour of
letting trailing spaces disappear off the right side of the page rather
than wrap.

This turns out to be a good test case for our break iterators. It exposes
a couple of problems; the fixes are squashed together in the attached patch:

We create an ICU string twice: once to compare it to the cached one, and
a second time to replace the cache if it differs
http://cgit.freedesktop.org/libreoffice/core/commit/?id=268ec2e64f89eb39fd5f02688787cd6f53e948b5

It is better to index into the string instead of copying a substring when
counting "characters" (graphemes)
http://cgit.freedesktop.org/libreoffice/core/commit/?id=fd4fe85329654883a0bf3304ad0aa8ef0bfde844

ICU's string compare is very slow when you only want to check for
equality
http://cgit.freedesktop.org/libreoffice/core/commit/?id=ccc47b3db3eae25cc11bb709416c0b61747ca89e

Finally, assume that a space doesn't combine with anything in any exotic
way to form a single grapheme
http://cgit.freedesktop.org/libreoffice/core/commit/?id=ae716b07f7218fadf0143de1946cc9e0e2c08744

C.

From 2e9ee2040c52a4425f91be9b0083b78b25d662b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
Date: Tue, 1 May 2012 16:30:52 +0100
Subject: [PATCH] Resolves: fdo#49208 performance problems with very long
 paragraphs

---
 .../source/breakiterator/breakiterator_unicode.cxx |   27 +++++++++++++++++--
 sw/inc/breakit.hxx                                 |    7 ++++-
 sw/source/core/bastyp/breakit.cxx                  |   19 +++++++++----
 sw/source/core/txtnode/txtedt.cxx                  |    2 +-
 4 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx 
b/i18npool/source/breakiterator/breakiterator_unicode.cxx
index aa5e1d8..5e5f6d5 100644
--- a/i18npool/source/breakiterator/breakiterator_unicode.cxx
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -34,6 +34,7 @@
 #include <unicode/udata.h>
 #include <rtl/strbuf.hxx>
 #include <rtl/ustring.hxx>
+#include <string.h>
 
 U_CDECL_BEGIN
 extern const char OpenOffice_dat[];
@@ -94,6 +95,24 @@ class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
 
 };
 
+namespace
+{
+    bool isEqual(const UnicodeString &rOne, const rtl::OUString &rOther)
+    {
+        sal_Int32 nLength = rOne.length();
+        if (nLength != rOther.getLength())
+            return false;
+
+        //fdo#49208 operator== is implemented by compareTo etc in icu which is
+        //horrifically slow when all you want to know is that they're the same
+        //or not
+        const UChar *pOne = rOne.getBuffer();
+        // UChar != sal_Unicode in MinGW
+        const UChar *pOther = reinterpret_cast<const UChar *>(rOther.getStr());
+        return memcmp(pOne, pOther, nLength * sizeof(UChar)) == 0;
+    }
+}
+
 // loading ICU breakiterator on demand.
 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& 
rLocale,
         sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) 
throw(uno::RuntimeException)
@@ -199,13 +218,15 @@ void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const 
com::sun::star::
         }
     }
 
-    if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar 
*>(rText.getStr()), rText.getLength()))) {   // UChar != sal_Unicode in MinGW
-        icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), 
rText.getLength());
+    if (newBreak || !isEqual(icuBI->aICUText, rText))
+    {
+        // UChar != sal_Unicode in MinGW
+        const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
+        icuBI->aICUText=UnicodeString(pText, rText.getLength());
         icuBI->aBreakIterator->setText(icuBI->aICUText);
     }
 }
 
-
 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
         sal_Int32 nStartPos, const lang::Locale &rLocale,
         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
diff --git a/sw/inc/breakit.hxx b/sw/inc/breakit.hxx
index 3075fc9..0bdce8c 100644
--- a/sw/inc/breakit.hxx
+++ b/sw/inc/breakit.hxx
@@ -112,7 +112,12 @@ public:
     sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const;
     sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const;
 
-    sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const;
+    sal_Int32 getGraphemeCount(const rtl::OUString& rStr,
+        sal_Int32 nStart, sal_Int32 nEnd) const;
+    sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const
+    {
+        return getGraphemeCount(rStr, 0, rStr.getLength());
+    }
 };
 
 #define SW_BREAKITER()  SwBreakIt::Get()
diff --git a/sw/source/core/bastyp/breakit.cxx b/sw/source/core/bastyp/breakit.cxx
index bb93660..c35afa0 100644
--- a/sw/source/core/bastyp/breakit.cxx
+++ b/sw/source/core/bastyp/breakit.cxx
@@ -168,16 +168,23 @@ sal_uInt16 SwBreakIt::GetAllScriptsOfText( const String& rTxt ) const
     return nRet;
 }
 
-sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText) const
+sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText, sal_Int32 nStart, sal_Int32 
nEnd) const
 {
     sal_Int32 nGraphemeCount = 0;
 
-    sal_Int32 nCurPos = 0;
-    while (nCurPos < rText.getLength())
+    sal_Int32 nCurPos = nStart;
+    while (nCurPos < nEnd)
     {
-        sal_Int32 nCount2 = 1;
-        nCurPos = xBreak->nextCharacters(rText, nCurPos, lang::Locale(),
-            i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
+        //fdo#49208 cheat and assume that nothing can combine with a space
+        //to form a single grapheme
+        if (rText[nCurPos] == ' ')
+            ++nCurPos;
+        else
+        {
+            sal_Int32 nCount2 = 1;
+            nCurPos = xBreak->nextCharacters(rText, nCurPos, lang::Locale(),
+                i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
+        }
         ++nGraphemeCount;
     }
 
diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx
index 86e4b69..186d4ea 100644
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -1863,7 +1863,7 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
         }
     }
 
-    nTmpChars = pBreakIt->getGraphemeCount(aExpandText.copy(nExpandBegin, nExpandEnd - 
nExpandBegin));
+    nTmpChars = pBreakIt->getGraphemeCount(aExpandText, nExpandBegin, nExpandEnd);
     nTmpChars -= nNumOfMaskedChars;
 
     // no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars
-- 
1.7.7.6

Context

[REVIEW 3-5] fdo#49208 ridiculous performance on certain .doc · Caolán McNamara
- Re: [REVIEW 3-5] fdo#49208 ridiculous performance on certain .doc · Muhammad Haggag
  - Re: [REVIEW 3-5] fdo#49208 ridiculous performance on certain .doc · Caolán McNamara
- [PUSHED 3-5] fdo#49208 ridiculous performance on certain .doc · Michael Meeks
  - Re: [PUSHED 3-5] fdo#49208 ridiculous performance on certain .doc · Caolán McNamara

Privacy Policy | Impressum (Legal Info) | Copyright information: Unless otherwise specified, all text and images on this website are licensed under the Creative Commons Attribution-Share Alike 3.0 License. This does not include the source code of LibreOffice, which is licensed under the Mozilla Public License (MPLv2). "LibreOffice" and "The Document Foundation" are registered trademarks of their corresponding registered owners or are in actual use as trademarks in one or more countries. Their respective logos and icons are also subject to international copyright laws. Use thereof is explained in our trademark policy.