On Fri, 2012-05-11 at 22:22 +0100, Michael Meeks wrote:
Argh - and it's entirely possible that this breaks the
CVE-2010-3454-1.doc test on -3-5 - but it's rather too late to double
check that now; seems to pass on master though; most odd. Will poke
Monday.
Wow - this was a -really- 'fun' problem to nail ;-) it turns out that
simply walking the fat chain pollutes the state of the streams in a way
that is extraordinarily hard to unwind; ie. even just calling the
original makePageChainCache method (or sim.) would seek beyond the end
of the stream, putting it in some un-recoverable state (or somesuch).
Anyhow - after the big chunk of life working that out, it dawned on me
that there is no need for the pagechaincache building to be slow, and
that we should do it always, and incrementally as we read. Hopefully
that'll still allow us to recover parts of word documents that are not
seekable.
So - I re-worked this to simplify, incrementally build the page chain
cache which might help performance in nasty corner cases, and also wrote
some regression tests [ which are hairy, the sot/ 'pass' documents have
some nice instances of broken FAT chains ;-].
The 'slow.doc' parses in ~4 seconds for me now; though there is some
hyper-long and painfully incomprehensible 15 second thrash after that
(with no progress bar) still ;-)
I'd like to get this reviewed, more widely tested and into -3-5 (as yet
it's not in master either, this is vs. -3-5 ;-) It'd be worth testing
any documents we know of, where previously we could recover some of the
document content from the beginning of the stream, where the end was
corrupt (I guess).
Thanks !
Michael.
--
michael.meeks@suse.com <><, Pseudo Engineer, itinerant idiot
From fe0e7c2664a6fc9e3fae4404d41645cecc9e2624 Mon Sep 17 00:00:00 2001
From: Michael Meeks <michael.meeks@suse.com>
Date: Fri, 11 May 2012 20:19:21 +0100
Subject: [PATCH] sot: re-work OLE2 offset-to-page computation with a FAT
chain cache
---
sot/qa/cppunit/test_sot.cxx | 74 ++++++++++++++++++++-
sot/source/sdstor/stgstrms.cxx | 144 +++++++++++++++++++++++++++++-----------
sot/source/sdstor/stgstrms.hxx | 3 +
3 files changed, 180 insertions(+), 41 deletions(-)
diff --git a/sot/qa/cppunit/test_sot.cxx b/sot/qa/cppunit/test_sot.cxx
index 36d0b02..0f91bdf 100644
--- a/sot/qa/cppunit/test_sot.cxx
+++ b/sot/qa/cppunit/test_sot.cxx
@@ -33,6 +33,7 @@
#include <osl/file.hxx>
#include <osl/process.h>
#include <sot/storage.hxx>
+#include <sot/storinfo.hxx>
using namespace ::com::sun::star;
@@ -45,6 +46,11 @@ namespace
public:
SotTest() {}
+ bool checkStream( const SotStorageRef &xObjStor,
+ const String &rStreamName,
+ sal_uLong nSize );
+ bool checkStorage( const SotStorageRef &xObjStor );
+
virtual bool load(const rtl::OUString &,
const rtl::OUString &rURL, const rtl::OUString &);
@@ -55,12 +61,78 @@ namespace
CPPUNIT_TEST_SUITE_END();
};
+ bool SotTest::checkStream( const SotStorageRef &xObjStor,
+ const String &rStreamName,
+ sal_uLong nSize )
+ {
+ unsigned char *pData = (unsigned char*)malloc( nSize );
+ sal_uLong nReadableSize = 0;
+ if( !pData )
+ return true;
+
+ { // Read the data in one block
+ SotStorageStreamRef xStream( xObjStor->OpenSotStream( rStreamName ) );
+ xStream->Seek(0);
+ sal_uLong nRemaining = xStream->GetSize() - xStream->Tell();
+
+ CPPUNIT_ASSERT_MESSAGE( "check size", nRemaining == nSize );
+ CPPUNIT_ASSERT_MESSAGE( "check size #2", xStream->remainingSize() == nSize );
+
+ // Read as much as we can, a corrupted FAT chain can cause real grief here
+ nReadableSize = xStream->Read( (void *)pData, nSize );
+// fprintf(stderr, "readable size %d vs size %d remaining %d\n", nReadableSize, nSize,
nReadableSize);
+ }
+ { // Read the data backwards as well
+ SotStorageStreamRef xStream( xObjStor->OpenSotStream( rStreamName ) );
+ for( sal_uLong i = nReadableSize; i > 0; i-- )
+ {
+ CPPUNIT_ASSERT_MESSAGE( "sot reading error", !xStream->GetError() );
+ unsigned char c;
+ xStream->Seek( i - 1 );
+ CPPUNIT_ASSERT_MESSAGE( "sot storage reading byte",
+ xStream->Read( &c, 1 ) == 1);
+ CPPUNIT_ASSERT_MESSAGE( "mismatching data storage reading byte",
+ pData[i - 1] == c );
+ }
+ }
+
+ return true;
+ }
+
+ bool SotTest::checkStorage( const SotStorageRef &xObjStor )
+ {
+ SvStorageInfoList aInfoList;
+ xObjStor->FillInfoList( &aInfoList );
+
+ for( SvStorageInfoList::iterator aIt = aInfoList.begin();
+ aIt != aInfoList.end(); aIt++ )
+ {
+ sal_uLong nSize = aIt->GetSize();
+ fprintf( stderr, "Stream '%s' size %ld\n",
+ rtl::OUStringToOString( aIt->GetName(), RTL_TEXTENCODING_UTF8 ).getStr(),
+ (long)nSize );
+ if( aIt->IsStorage() )
+ {
+ SotStorageRef xChild( xObjStor->OpenSotStorage( aIt->GetName() ) );
+ checkStorage( xChild );
+ }
+ else if( aIt->IsStream() )
+ checkStream( xObjStor, aIt->GetName(), aIt->GetSize() );
+ }
+
+ return true;
+ }
+
bool SotTest::load(const rtl::OUString &,
const rtl::OUString &rURL, const rtl::OUString &)
{
SvFileStream aStream(rURL, STREAM_READ);
SotStorageRef xObjStor = new SotStorage(aStream);
- return xObjStor.Is() && !xObjStor->GetError();
+ if (!xObjStor.Is() && !xObjStor->GetError())
+ return false;
+
+ CPPUNIT_ASSERT_MESSAGE("sot storage is not valid", xObjStor->Validate());
+ return checkStorage (xObjStor);
}
void SotTest::test()
diff --git a/sot/source/sdstor/stgstrms.cxx b/sot/source/sdstor/stgstrms.cxx
index 5e6c1ea..22ff35f 100644
--- a/sot/source/sdstor/stgstrms.cxx
+++ b/sot/source/sdstor/stgstrms.cxx
@@ -26,12 +26,12 @@
*
************************************************************************/
+#include <algorithm>
#include <string.h> // memcpy()
-
+#include <sal/log.hxx>
#include <osl/file.hxx>
#include <tools/tempfile.hxx>
-#include <tools/debug.hxx>
#include "sot/stg.hxx"
#include "stgelem.hxx"
@@ -334,13 +334,46 @@ void StgStrm::SetEntry( StgDirEntry& r )
r.SetDirty();
}
+/*
+ * The page chain, is basically a singly linked list of slots each
+ * point to the next page. Instead of traversing the file structure
+ * for this each time build a simple flat in-memory vector list
+ * of pages.
+ */
+void StgStrm::scanBuildPageChainCache(sal_Int32 *pOptionalCalcSize)
+{
+ if (nSize > 0)
+ m_aPagesCache.reserve(nSize/nPageSize);
+
+ bool bError = false;
+ sal_Int32 nBgn = nStart;
+ sal_Int32 nOldBgn = -1;
+ sal_Int32 nOptSize = 0;
+ while( nBgn >= 0 && nBgn != nOldBgn )
+ {
+ if( nBgn >= 0 )
+ m_aPagesCache.push_back(nBgn);
+ nOldBgn = nBgn;
+ nBgn = pFat->GetNextPage( nBgn );
+ if( nBgn == nOldBgn )
+ bError = true;
+ nOptSize += nPageSize;
+ }
+ if (bError)
+ {
+ if (pOptionalCalcSize)
+ rIo.SetError( ERRCODE_IO_WRONGFORMAT );
+ m_aPagesCache.clear();
+ }
+ if (pOptionalCalcSize)
+ *pOptionalCalcSize = nOptSize;
+}
+
// Compute page number and offset for the given byte position.
// If the position is behind the size, set the stream right
// behind the EOF.
-
sal_Bool StgStrm::Pos2Page( sal_Int32 nBytePos )
{
- sal_Int32 nRel, nBgn;
// Values < 0 seek to the end
if( nBytePos < 0 || nBytePos >= nSize )
nBytePos = nSize;
@@ -353,41 +386,71 @@ sal_Bool StgStrm::Pos2Page( sal_Int32 nBytePos )
nPos = nBytePos;
if( nOld == nNew )
return sal_True;
- if( nNew > nOld )
- {
- // the new position is behind the current, so an incremental
- // positioning is OK. Set the page relative position
- nRel = nNew - nOld;
- nBgn = nPage;
- }
- else
+
+ // See fdo#47644 for a .doc with a vast amount of pages where seeking around the
+ // document takes a colossal amount of time
+ //
+ // Please Note: we build the pagescache incrementally as we go if necessary,
+ // so that a corrupted FAT doesn't poison the stream state for earlier reads
+ size_t nIdx = nNew / nPageSize;
+ if( nIdx >= m_aPagesCache.size() )
{
- // the new position is before the current, so we have to scan
- // the entire chain.
- nRel = nNew;
- nBgn = nStart;
+ // Extend the FAT cache ! ...
+ size_t nToAdd = nIdx + 1;
+
+ if (m_aPagesCache.empty())
+ m_aPagesCache.push_back( nStart );
+
+ nToAdd -= m_aPagesCache.size();
+
+ sal_Int32 nBgn = m_aPagesCache.back();
+// fprintf (stderr, "nIdx %d to-add %d start page %d pagesize %d\n",
+// (int)nIdx, (int)nToAdd, (int)nBgn, (int) nPageSize);
+
+ // Start adding pages while we can
+ while( nToAdd > 0 && nBgn >= 0 )
+ {
+ nBgn = pFat->GetNextPage( nBgn );
+// fprintf (stderr, "add page %d\n", (int)nBgn);
+ if( nBgn >= 0 )
+ {
+ m_aPagesCache.push_back( nBgn );
+ nToAdd--;
+ }
+ }
}
- // now, traverse the FAT chain.
- nRel /= nPageSize;
- sal_Int32 nLast = STG_EOF;
- while( nRel && nBgn >= 0 )
+
+ if ( nIdx > m_aPagesCache.size() )
{
- nLast = nBgn;
- nBgn = pFat->GetNextPage( nBgn );
- nRel--;
+ rIo.SetError( SVSTREAM_FILEFORMAT_ERROR );
+ nPage = STG_EOF;
+ nOffset = nPageSize;
+// fprintf (stderr, "returns over-end page %d offset %d err %d ret %d\n",
+// (int)nPage, (int)nOffset, (int) rIo.GetError(), false);
+ return sal_False;
}
// special case: seek to 1st byte of new, unallocated page
// (in case the file size is a multiple of the page size)
- if( nBytePos == nSize && nBgn == STG_EOF && !nRel && !nOffset )
- nBgn = nLast, nOffset = nPageSize;
- if( nBgn < 0 && nBgn != STG_EOF )
+ if( nBytePos == nSize && !nOffset && nIdx > 0 && nIdx == m_aPagesCache.size() )
{
- rIo.SetError( SVSTREAM_FILEFORMAT_ERROR );
- nBgn = STG_EOF;
+// fprintf (stderr, "this odd case\n");
+ nIdx--;
nOffset = nPageSize;
}
- nPage = nBgn;
- return sal_Bool( nRel == 0 && nPage >= 0 );
+ else if ( nIdx == m_aPagesCache.size() )
+ {
+ nPage = STG_EOF;
+// fprintf (stderr, "returns eof page %d offset %d err %d ret %d\n",
+// (int)nPage, (int)nOffset, (int) rIo.GetError(), false);
+ return sal_False;
+ }
+
+ nPage = m_aPagesCache[ nIdx ];
+
+// fprintf (stderr, "returns page %d offset %d err %d ret %d\n",
+// (int)nPage, (int)nOffset, (int) rIo.GetError(), nPage >= 0 );
+
+ return nPage >= 0;
}
// Retrieve the physical page for a given byte offset.
@@ -404,6 +467,8 @@ StgPage* StgStrm::GetPhysPage( sal_Int32 nBytePos, sal_Bool bForce )
sal_Bool StgStrm::Copy( sal_Int32 nFrom, sal_Int32 nBytes )
{
+ m_aPagesCache.clear();
+
sal_Int32 nTo = nStart;
sal_Int32 nPgs = ( nBytes + nPageSize - 1 ) / nPageSize;
while( nPgs-- )
@@ -430,6 +495,8 @@ sal_Bool StgStrm::Copy( sal_Int32 nFrom, sal_Int32 nBytes )
sal_Bool StgStrm::SetSize( sal_Int32 nBytes )
{
+ m_aPagesCache.clear();
+
// round up to page size
sal_Int32 nOld = ( ( nSize + nPageSize - 1 ) / nPageSize ) * nPageSize;
sal_Int32 nNew = ( ( nBytes + nPageSize - 1 ) / nPageSize ) * nPageSize;
@@ -528,6 +595,8 @@ sal_Int32 StgFATStrm::GetPage( short nOff, sal_Bool bMake, sal_uInt16 *pnMasterA
{
if( bMake )
{
+ m_aPagesCache.clear();
+
// create a new master page
nFAT = nMaxPage++;
pMaster = rIo.Copy( nFAT, STG_FREE );
@@ -582,6 +651,8 @@ sal_Int32 StgFATStrm::GetPage( short nOff, sal_Bool bMake, sal_uInt16 *pnMasterA
sal_Bool StgFATStrm::SetPage( short nOff, sal_Int32 nNewPage )
{
+ m_aPagesCache.clear();
+
sal_Bool bRes = sal_True;
if( nOff < rIo.aHdr.GetFAT1Size() )
rIo.aHdr.SetFATPage( nOff, nNewPage );
@@ -631,6 +702,8 @@ sal_Bool StgFATStrm::SetPage( short nOff, sal_Int32 nNewPage )
sal_Bool StgFATStrm::SetSize( sal_Int32 nBytes )
{
+ m_aPagesCache.clear();
+
// Set the number of entries to a multiple of the page size
short nOld = (short) ( ( nSize + ( nPageSize - 1 ) ) / nPageSize );
short nNew = (short) (
@@ -747,16 +820,7 @@ void StgDataStrm::Init( sal_Int32 nBgn, sal_Int32 nLen )
{
// determine the actual size of the stream by scanning
// the FAT chain and counting the # of pages allocated
- nSize = 0;
- sal_Int32 nOldBgn = -1;
- while( nBgn >= 0 && nBgn != nOldBgn )
- {
- nOldBgn = nBgn;
- nBgn = pFat->GetNextPage( nBgn );
- if( nBgn == nOldBgn )
- rIo.SetError( ERRCODE_IO_WRONGFORMAT );
- nSize += nPageSize;
- }
+ scanBuildPageChainCache( &nSize );
}
}
diff --git a/sot/source/sdstor/stgstrms.hxx b/sot/source/sdstor/stgstrms.hxx
index a6a0ad1..5a9558b 100644
--- a/sot/source/sdstor/stgstrms.hxx
+++ b/sot/source/sdstor/stgstrms.hxx
@@ -30,6 +30,7 @@
#define _STGSTRMS_HXX
#include <tools/stream.hxx>
+#include <vector>
class StgIo;
class StgStrm;
@@ -77,6 +78,8 @@ protected:
sal_Int32 nPage; // current logical page
short nOffset; // offset into current page
short nPageSize; // logical page size
+ std::vector<sal_Int32> m_aPagesCache;
+ void scanBuildPageChainCache(sal_Int32 *pOptionalCalcSize = NULL);
sal_Bool Copy( sal_Int32 nFrom, sal_Int32 nBytes );
StgStrm( StgIo& );
public:
--
1.7.7
Context
Privacy Policy |
Impressum (Legal Info) |
Copyright information: Unless otherwise specified, all text and images
on this website are licensed under the
Creative Commons Attribution-Share Alike 3.0 License.
This does not include the source code of LibreOffice, which is
licensed under the Mozilla Public License (
MPLv2).
"LibreOffice" and "The Document Foundation" are
registered trademarks of their corresponding registered owners or are
in actual use as trademarks in one or more countries. Their respective
logos and icons are also subject to international copyright laws. Use
thereof is explained in our
trademark policy.