Date: prev next · Thread: first prev next last
2017 Archives by date, by thread · List index


(1) If the input is assumed to be an arbitrary sequence of Unicode scalar values (i.e., it may contain noncharacters, despite the caveat that those should never be interchanged), the invalidChar handling below might also want to watch out for U+FFFE and U+FFFF. (See writeValueContent in configmgr/source/writemodfile.cxx. Somewhat oddly, XML 1.0 excludes those two noncharacters from its Char definition while not excluding any of the other Unicode noncharacters, namely U+FDD0..U+FDEF and U+nFFFE..U+nFFFF for n in 1..10 hex.)

On 02/28/2017 10:30 PM, Eike Rathke wrote:
commit baca2ec8d5a457512e25b499c3cacc7a66ca853f
Author: Eike Rathke <erack@redhat.com>
Date:   Tue Feb 28 22:14:08 2017 +0100

    FastSaxSerializer: SAL_WARN() when writing invalid XML characters

    This catches things for OOXML, that could be escaped using _xHHHH_

    Change-Id: I937f67dc5edd3c0e5727d74bebb736dc82bdc53d

diff --git a/sax/source/tools/fastserializer.cxx b/sax/source/tools/fastserializer.cxx
index 620fe68..a571829 100644
--- a/sax/source/tools/fastserializer.cxx
+++ b/sax/source/tools/fastserializer.cxx
@@ -101,6 +101,26 @@ namespace sax_fastparser {
         write( sOutput.getStr(), sOutput.getLength(), bEscape );
     }

+#if OSL_DEBUG_LEVEL > 0
+    /** Characters not allowed in XML 1.0
+        XML 1.1 would exclude only U+0000
+     */
+    bool invalidChar( char c )
+    {
+        if (static_cast<unsigned char>(c) >= 0x20)
+            return false;
+
+        switch (c)
+        {
+            case 0x09:
+            case 0x0a:
+            case 0x0d:
+                return false;
+        }
+        return true;
+    }
+#endif
+
     void FastSaxSerializer::write( const char* pStr, sal_Int32 nLen, bool bEscape )
     {
         if (nLen == -1)
@@ -112,6 +132,7 @@ namespace sax_fastparser {
             return;
         }

+        bool bGood = true;
         for (sal_Int32 i = 0; i < nLen; ++i)
         {
             char c = pStr[ i ];
@@ -124,9 +145,26 @@ namespace sax_fastparser {
                 case '"':   writeBytes( "&quot;", 6 );   break;
                 case '\n':  writeBytes( "&#10;", 5 );    break;
                 case '\r':  writeBytes( "&#13;", 5 );    break;
-                default:    writeBytes( &c, 1 );          break;
+                default:
+#if OSL_DEBUG_LEVEL > 0
+                            /* FIXME: we should escape such invalid characters
+                             * in the _xHHHH_ form OOXML uses. Note that also a
+                             * literal "_x0008_" would have to be escaped then
+                             * as _x005F_x0008_ (where only the leading '_' is
+                             * escaped as _x005F_). */
+                            if (invalidChar(pStr[i]))
+                            {
+                                bGood = false;
+                                // The SAL_WARN() for the single character is
+                                // issued in writeBytes(), just gather for the
+                                // SAL_WARN_IF() below.
+                            }
+#endif
+                            writeBytes( &c, 1 );         break;
             }
         }
+        SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'");
+        (void)bGood;
     }

     void FastSaxSerializer::endDocument()
@@ -496,6 +534,21 @@ namespace sax_fastparser {

     void FastSaxSerializer::writeBytes( const char* pStr, size_t nLen )
     {
+#if OSL_DEBUG_LEVEL > 0
+        {
+            bool bGood = true;
+            for (size_t i=0; i < nLen; ++i)
+            {
+                if (invalidChar(pStr[i]))
+                {
+                    bGood = false;
+                    SAL_WARN("sax", "FastSaxSerializer::writeBytes - illegal XML character 0x" <<
+                            std::hex << int(static_cast<unsigned char>(pStr[i])));
+                }
+            }
+            SAL_WARN_IF( !bGood && nLen > 1, "sax", "in '" << OString(pStr,std::min<sal_Int32>(nLen,42)) << "'");
+        }
+#endif
         maCachedOutputStream.writeBytes( reinterpret_cast<const sal_Int8*>(pStr), nLen );
     }



Context


Privacy Policy | Impressum (Legal Info) | Copyright information: Unless otherwise specified, all text and images on this website are licensed under the Creative Commons Attribution-Share Alike 3.0 License. This does not include the source code of LibreOffice, which is licensed under the Mozilla Public License (MPLv2). "LibreOffice" and "The Document Foundation" are registered trademarks of their corresponding registered owners or are in actual use as trademarks in one or more countries. Their respective logos and icons are also subject to international copyright laws. Use thereof is explained in our trademark policy.