?
Solved

Accessing a Microsoft Office iFilter.

Posted on 2003-03-03
10
Medium Priority
?
1,522 Views
Last Modified: 2012-06-27
Dear Experts,

I have found code on this site which accesses iFilters from Adobe, but I cannot read anything from a Microsoft Office filter (OffFilt.dll).

Can anyone get this code to work with the office ifilters?

Note, I did not write all of this code, it was found
on this site but it does work for Adobe PDF documents
which are not Secured against indexing. I cannot get it
to work with microsoft word documents.


Here is the code declarations:


//-------------------------------------------------------
// unit1.pas:
//-------------------------------------------------------


unit Unit1;

interface

uses
  Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs,
  StdCtrls , filter , ntquery ,  ComObj, ActiveX;

type
  { Main form of the IFilter test application.
    Button1Click lets the user choose a file through OpenDialog1 and stores
    its name in the unit-level MyFileName; Button2Click runs that file through
    its IFilter and writes the extracted text to Memo1.
    NOTE(review): the handlers are presumably attached to Button1/Button2 and
    to the form's OnCreate in the .DFM, which is not visible here. }
  TForm1 = class(TForm)
    Button2: TButton;
    Label1: TLabel;
    Memo1: TMemo;
    OpenDialog1: TOpenDialog;
    Button1: TButton;
    procedure Button2Click(Sender: TObject);
    procedure FormCreate(Sender: TObject);
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  // The application's form instance.
  Form1      : TForm1;
  // File name of the document to filter: written by TForm1.Button1Click,
  // read by TForm1.Button2Click.
  MyFileName : String ;


implementation

{$R *.DFM}



procedure TForm1.Button2Click(Sender: TObject);
{ Loads the IFilter registered for MyFileName (through LoadIFilter in
  query.dll), walks the chunks it produces and appends the text of every
  text chunk to Memo1.

  Sender is not used. A failing LoadIFilter or IFilter.Init is reported with
  ShowMessage; no exception is raised for a bad HRESULT.

  NOTE(review): Init is called with IFILTER_INIT_INDEXING_ONLY only. Some
  filters (reportedly the Office filter) may need
  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES and the CANON_* flags before they emit
  any chunk - confirm against the IFilter::Init documentation. }
const
  BUF_CHARS = 2048;   // capacity, in WideChars, offered to each GetText call
var
  punknown: IUnknown;
  pfilt:    IFilter;
  chunk:    STAT_CHUNK;
  wsName:   WideString;
  buf:      array[0..BUF_CHARS] of WideChar;  // BUF_CHARS + 1 slots: one spare
  hret:     HResult;
  flags:    ULONG;
  c:        Cardinal;
begin
  Memo1.Text := '';

  ShowMessage('using:' + MyFileName);

  // A WideString holds the whole path; no fixed-size conversion buffer that
  // could truncate it and no manual AllocMem/FreeMem pair to leak.
  wsName := MyFileName;

  hret := LoadIFilter(PWideChar(wsName), nil, punknown);
  if hret <> S_OK then
    ShowMessage(Format('LoadIFilter failed (HRESULT $%.8x)', [hret]))
  else
  begin
    if punknown.QueryInterface(IFilter, pfilt) = S_OK then
    begin
      flags := 0;
      hret := pfilt.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags);
      if hret = S_OK then
      begin
        hret := pfilt.GetChunk(chunk);
        // The cast makes the comparison work even when the constant is
        // declared as an unsigned literal ($80041700 = -2147215616).
        if hret = HResult(FILTER_E_END_OF_CHUNKS) then
          ShowMessage('Could not get chunk');

        while hret = S_OK do
        begin
          // Only text chunks can be read with GetText; value chunks are
          // skipped instead of producing an error and stale output.
          if (chunk.flags and CHUNK_TEXT) <> 0 then
          begin
            Caption := 'processing:';
            repeat
              c := BUF_CHARS;
              hret := pfilt.GetText(c, @buf[0]);
              // Append only when the call really delivered text. On any
              // failure code the buffer still holds the previous block.
              if (hret = S_OK) or (hret = HResult(FILTER_S_LAST_TEXT)) then
                Memo1.Text := Memo1.Text + WideCharLenToString(@buf[0], c);
              Application.ProcessMessages;
            until hret <> S_OK;
          end;
          hret := pfilt.GetChunk(chunk);
        end;
      end
      else
        ShowMessage(Format('IFilter.Init failed (HRESULT $%.8x)', [hret]));

      // Release the filter before the object it was queried from.
      pfilt := nil;
    end;
    punknown := nil;
  end;

  Caption := 'completed processing';
end;

procedure TForm1.FormCreate(Sender: TObject);
{ Form creation handler: starts with an empty output memo. Sender is unused. }
begin
  Memo1.Text := '';
end;

procedure TForm1.Button1Click(Sender: TObject);
{ Asks the user for a file and remembers it in MyFileName for Button2Click.
  Sender is unused. When the dialog is cancelled the previously selected
  file name and the caption are left untouched. }
begin
  // Execute returns False on Cancel; FileName is then not a fresh selection.
  if not OpenDialog1.Execute then
    Exit;

  MyFileName := OpenDialog1.FileName;
  Caption := 'Filename set:' + MyFileName;
end;

end.



//-------------------------------------------------
//  filter.pas
//-------------------------------------------------

unit filter;

interface

uses
 Windows, SysUtils, Classes, ActiveX;

// Pascal translation of the IFilter enumerations and structures.
//
// The records are deliberately NOT declared "packed": the C declarations use
// natural alignment. On 32-bit targets every field is 4-byte aligned, so the
// layout is the same with or without "packed". On 64-bit targets the pointer
// inside PROPSPEC needs 8-byte alignment and a packed record would not match
// the layout the filter DLL expects.

type
  // Flags for the grfFlags argument of IFilter.Init (combine with "or").
  IFILTER_INIT = TOleEnum;
const
  IFILTER_INIT_CANON_PARAGRAPHS       = 1;
  IFILTER_INIT_HARD_LINE_BREAKS       = 2;
  IFILTER_INIT_CANON_HYPHENS          = 4;
  IFILTER_INIT_CANON_SPACES           = 8;
  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16;
  IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32;
  IFILTER_INIT_INDEXING_ONLY          = 64;
  IFILTER_INIT_SEARCH_LINKS           = 128;

type
  // Flags returned through the pFlags argument of IFilter.Init.
  IFILTER_FLAGS = TOleEnum;
const
  IFILTER_FLAGS_OLE_PROPERTIES = 1;

type
  // Kind of data a chunk carries (STAT_CHUNK.flags).
  CHUNKSTATE = TOleEnum;
const
  CHUNK_TEXT  = $01;
  CHUNK_VALUE = $02;

type
  // Break between the current chunk and the previous one
  // (STAT_CHUNK.breakType).
  CHUNK_BREAKTYPE = TOleEnum;
const
  CHUNK_NO_BREAK = 0;
  CHUNK_EOW      = 1;
  CHUNK_EOS      = 2;
  CHUNK_EOP      = 3;
  CHUNK_EOC      = 4;

type
  // Region of text inside a chunk, as passed to IFilter.BindRegion.
  FILTERREGION = record
    idChunk:   ULONG;
    cwcStart:  ULONG;
    cwcExtent: ULONG;
  end;
  tagFILTERREGION = FILTERREGION;

const
  // Values for PROPSPEC.ulKind: selects which member of the union is valid.
  PRSPEC_LPWSTR = 0;
  PRSPEC_PROPID = 1;

type
  PROPID = ULONG;

type
  // Property identified either by numeric id (prid) or by name (lpws).
  PROPSPEC = record
    ulKind: ULONG;
    case integer of
      0: (prid: PROPID);
      1: (lpws: PWideChar);
  end;
  tagPROPSPEC = PROPSPEC;

type
  // Property set GUID plus the property inside that set.
  FULLPROPSPEC = record
    guidPropSet: TGUID;
    psProperty:  PROPSPEC;
  end;
  tagFULLPROPSPEC = FULLPROPSPEC;
  PFULLPROPSPEC   = ^FULLPROPSPEC;

type
  // Description of one chunk, filled in by IFilter.GetChunk.
  STAT_CHUNK = record
    idChunk:        ULONG;
    breakType:      CHUNK_BREAKTYPE;
    flags:          CHUNKSTATE;
    locale:         LCID;
    attribute:      FULLPROPSPEC;
    idChunkSource:  ULONG;
    cwcStartSource: ULONG;
    cwcLenSource:   ULONG;
  end;
  tagSTAT_CHUNK = STAT_CHUNK;

// From filterr.h
//
// Every value is cast to HResult. Without the cast a literal such as
// $80041700 is an unsigned constant, and comparing it with an HResult
// variable (a signed 32-bit integer) widens both sides, so the comparison
// can never be True for the error codes.

const
  // No more chunks of text available in object.
  FILTER_E_END_OF_CHUNKS         = HResult($80041700);
  // No more text available in chunk.
  FILTER_E_NO_MORE_TEXT          = HResult($80041701);
  // No more property values available in chunk.
  FILTER_E_NO_MORE_VALUES        = HResult($80041702);
  // Unable to access object.
  FILTER_E_ACCESS                = HResult($80041703);
  // Moniker doesn't cover entire region.
  // This is a warning, i.e. a success code: the high (failure) bit is clear.
  FILTER_W_MONIKER_CLIPPED       = HResult($00041704);
  // No text in current chunk.
  FILTER_E_NO_TEXT               = HResult($80041705);
  // No values in current chunk.
  FILTER_E_NO_VALUES             = HResult($80041706);
  // Unable to bind IFilter for embedded object.
  FILTER_E_EMBEDDING_UNAVAILABLE = HResult($80041707);
  // Unable to bind IFilter for linked object.
  FILTER_E_LINK_UNAVAILABLE      = HResult($80041708);
  // This is the last text in the current chunk.
  FILTER_S_LAST_TEXT             = HResult($00041709);
  // This is the last value in the current chunk.
  FILTER_S_LAST_VALUES           = HResult($0004170A);
  // File was not filtered due to password protection.
  FILTER_E_PASSWORD              = HResult($8004170B);
  // The document format is not recognized by the filter.
  FILTER_E_UNKNOWNFORMAT         = HResult($8004170C);


const
  // Interface identifier of IFilter (same GUID as in the declaration below).
  IID_IFIlter: TGUID = '{89BCB740-6119-101A-BCB7-00DD010655AF}';

type
  { Text-extraction interface implemented by filter DLLs.

    Init        prepares a filtering session; grfFlags takes IFILTER_INIT_*
                values, pFlags receives IFILTER_FLAGS_* values.
    GetChunk    positions the filter on the next chunk and describes it.
    GetText     reads text of the current chunk; pcwcBuffer is the buffer
                capacity in WideChars on entry and the count written on exit.
    GetValue    reads the property value of the current chunk. The COM
                method is declared as PROPVARIANT** - the filter hands back a
                pointer - so the parameter is an "out" pointer, not an "out"
                record.
    BindRegion  riid corresponds to REFIID, i.e. it is passed by reference,
                hence "const". }
  IFilter = interface(IUnknown)
    ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
    function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
    function GetChunk(out pStat: STAT_CHUNK): HResult; stdcall;
    function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
    function GetValue(out ppPropValue: PPropVariant): HResult; stdcall;
    function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
  end;

implementation

end.


//------------------------------------------------------
// ntquery.pas
//------------------------------------------------------


unit ntquery;

interface

uses
 Windows, SysUtils, ActiveX;

//
// Use this path for the null catalog, one that doesn't have an index.
// Use it to search for properties of files that are not indexed.
//
const
  CINULLCATALOG = '::_noindex_::';

//
// Minimal support for persistent handlers.
//
// All routines are bound statically to query.dll, so a program using this
// unit will not start on a machine where that DLL is missing.
//
// ppIUnk receives the interface pointer of the loaded filter.
//
function LoadIFilter(pwcsPath: PWideChar; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';
function BindIFilterFromStorage(pStg: IStorage; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';
function BindIFilterFromStream(pStm: IStream; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';

//
// pccMachine and pccCat are ULONG* in the C declaration (buffer size in,
// length out), so they are passed by reference here. Passing them by value
// makes the DLL dereference the count as an address.
//
function LocateCatalogsW(pwszScope: PWideChar; iBmk: ULONG; pwszMachine: PWideChar; var pccMachine: ULONG; pwszCat: PWideChar; var pccCat: ULONG): HResult; stdcall; external 'query.dll';

//
// For calling from VB
//
// ANSI variant: the strings are single-byte, so PAnsiChar is used. PChar is
// a wide-character pointer on Unicode-enabled compilers.
//
function LocateCatalogsA(pwszScope: PAnsiChar; iBmk: ULONG; pwszMachine: PAnsiChar; var pccMachine: ULONG; pwszCat: PAnsiChar; var pccCat: ULONG): HResult; stdcall; external 'query.dll';

// The Index Server Data Source Object CLSID
const
 CLSID_INDEX_SERVER_DSO: TGUID =  '{D7A2B01A-A47D-11D0-8C55-00C04FC2DB8D}';

// The storage property set
const
 PSGUID_STORAGE:         TGUID =  '{B725F130-47EF-101A-A5F1-02608C9EEBAC}';

// Property ids inside the storage property set.
// Ids 0 and 1 are reserved and ids 7 and $11 are unused (see the commented
// C defines), so the numbering below has gaps.
//#define PID_STG_DICTIONARY            ((PROPID) 0x00000000) //reserved
//#define PID_STG_CODEPAGE              ((PROPID) 0x00000001) //reserved
const
 PID_STG_DIRECTORY          =  $00000002;
 PID_STG_CLASSID            =  $00000003;
 PID_STG_STORAGETYPE        =  $00000004;
 PID_STG_VOLUME_ID          =  $00000005;
 PID_STG_PARENT_WORKID      =  $00000006;
 // unused #define PID_STG_              ((PROPID) 0x00000007)
 PID_STG_FILEINDEX          =  $00000008;
 PID_STG_LASTCHANGEUSN      =  $00000009;
 PID_STG_NAME               =  $0000000A;
 PID_STG_PATH               =  $0000000B;
 PID_STG_SIZE               =  $0000000C;
 PID_STG_ATTRIBUTES         =  $0000000D;
 PID_STG_WRITETIME          =  $0000000E;
 PID_STG_CREATETIME         =  $0000000F;
 PID_STG_ACCESSTIME         =  $00000010;
 // unused #define PID_STG_              ((PROPID) 0x00000011)
 PID_STG_ALLOCSIZE          =  $00000012;
 PID_STG_CONTENTS           =  $00000013;
 PID_STG_SHORTNAME          =  $00000014;
 // Highest property id defined above.
 PID_STG_MAX                =  PID_STG_SHORTNAME;
 // $15 = PID_STG_MAX + 1.
 CSTORAGEPROPERTY           =  $15;

// File System Content Index Framework property set
const
 DBPROPSET_FSCIFRMWRK_EXT:  TGUID =  '{A9BD1526-6A80-11D0-8C9D-0020AF1D740E}';

// Property ids for DBPROPSET_FSCIFRMWRK_EXT.
// DBPROP_CI_DEPTHS and DBPROP_CI_SCOPE_FLAGS intentionally share the value 4;
// the former is the obsolete name.
const
 DBPROP_CI_CATALOG_NAME     =  2;
 DBPROP_CI_INCLUDE_SCOPES   =  3;
 DBPROP_CI_DEPTHS           =  4; // obsolete
 DBPROP_CI_SCOPE_FLAGS      =  4;
 DBPROP_CI_EXCLUDE_SCOPES   =  5;
 DBPROP_CI_SECURITY_ID      =  6;
 DBPROP_CI_QUERY_TYPE       =  7;

// Query Extension property set
const
 DBPROPSET_QUERYEXT:        TGUID =  '{A7AC77ED-F8D7-11CE-A798-0020F8008025}';

// Property ids for DBPROPSET_QUERYEXT.
const
 DBPROP_USECONTENTINDEX        =  2;
 DBPROP_DEFERNONINDEXEDTRIMMING=  3;
 DBPROP_USEEXTENDEDDBTYPES     =  4;

// Content Index Framework Core property set
const
 DBPROPSET_CIFRMWRKCORE_EXT:   TGUID =  '{AFAFACA5-B5D1-11D0-8C62-00C04FC2DB8D}';

// Property ids for DBPROPSET_CIFRMWRKCORE_EXT.
const
 DBPROP_MACHINE                =  2;
 DBPROP_CLIENT_CLSID           =  3;

// Scope flags
// NOTE(review): the values look like two independent bits (depth = bit 0,
// path kind = bit 1) meant to be combined with "or", with SHALLOW and
// PHYSICAL_PATH being the zero defaults - confirm against ntquery.h.
const
 QUERY_SHALLOW                 =  0;
 QUERY_DEEP                    =  1;
 QUERY_PHYSICAL_PATH           =  0;
 QUERY_VIRTUAL_PATH            =  2;

// query property set (PSGUID_QUERY) properties not defined in oledb.h
const
 PROPID_QUERY_WORKID           =  5;
 PROPID_QUERY_UNFILTERED       =  7;
 PROPID_QUERY_VIRTUALPATH      =  9;
 PROPID_QUERY_LASTSEENTIME     =  10;

implementation

end.







0
Comment
Question by:delloro
[X]
Welcome to Experts Exchange

Add your voice to the tech community where 5M+ people just like you are talking about what matters.

  • Help others & share knowledge
  • Earn cash & points
  • Learn & ask questions
10 Comments
 
LVL 26

Expert Comment

by:Russell Libby
ID: 8065326

Delloro,

Is this the EE link that you picked up the code from?

http://www.experts-exchange.com/Programming/Programming_Languages/Delphi/Q_20293579.html

If so, then I can probably help you out. Could you please post what you have already attempted, and let me know if you want (a) to load the filter using MS's query.dll, or (b) want to load the filter yourself. You probably want (b), as it does not require the MS indexing services (which won't be installed on 95/98 boxes)

Russell
0
 
LVL 1

Author Comment

by:delloro
ID: 8065765
Yes, the above link is where I got the code from.

I have implemented (a) in the above code I think,
ideally I would like to get both versions working.
:), If it is extra work I can get my friend to give
you some more points if this becomes two questions.

Any help would be greatly appreciated even if we can
get one of the versions working... As you mentioned
(b) would be a nice solution.

Thanks



0
 
LVL 26

Expert Comment

by:Russell Libby
ID: 8065848
Give me an hour or so to look at it....
Russell
0
Independent Software Vendors: We Want Your Opinion

We value your feedback.

Take our survey and automatically be entered to win any one of the following:
Yeti Cooler, Amazon eGift Card, and Movie eGift Card!

 
LVL 26

Accepted Solution

by:
Russell Libby earned 2000 total points
ID: 8067066

Well....
Using code that performs operation (b) shown below, I am able to get the filter to load, query the IPersistFile, load the document, and call the filter's Init method. But when I call GetChunk(), it always fails with FILTER_E_END_OF_CHUNKS (no more chunks). I've tried it with .doc, .xls, etc... all the same result. Other filters work fine, for example the nlhtml.dll filter for HTML files (and the Adobe filter). So, unfortunately, I'm not sure where the issue lies, or how to resolve it. As you have probably already found out, documentation on this subject is pretty sparse.

Perhaps others here may be able to help further.

Russell


// Uses ActiveX, ComObj, filter

const
  MSOFFFilter:      TGUID =  '{f07f3920-7b8c-11cf-9be8-00aa004b9986}';
  HTMLFilter:       TGUID =  '{e0ca5340-4534-11cf-b952-00aa0051fe20}';


var  pfilt:      IFilter;
     ppfile:     IPersistFile;
     chunk:      STAT_CHUNK;
     pwc:        PWideChar;
     hret:       HResult;
     flags:      ULONG;
     c:          Cardinal;
     str:        String;
begin

  // Get the class instance for the office filter
  hret:=CoCreateInstance(MSOFFFilter, nil, CLSCTX_INPROC_SERVER or CLSCTX_INPROC_HANDLER, IFilter, pfilt);
  Assert(hret = S_OK, SysErrorMessage(hret));

  // Allocate memory for wide char
  pwc:=AllocMem(4096);
  StringToWideChar('c:\my documents\modulelist.xls', pwc, 2048);

  // Get the IPersistFile handler
  hret:=pfilt.QueryInterface(IPersistFile, ppfile);
  Assert(hret = S_OK, SysErrorMessage(hret));
  hret:=ppfile.Load(pwc, STGM_READ);
  Assert(hret = S_OK, SysErrorMessage(hret));

  // Try...Finally so we can clean up
  try

     // Init the filter
     flags:=0;
     hret:=pfilt.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags);
     Assert(hret = S_OK, SysErrorMessage(hret));

     // Get the first chunck
     while True do
     begin
        hret:=pfilt.GetChunk(chunk);
        if (hret <> S_OK) then break;
        if (chunk.flags <> CHUNK_TEXT) then Continue;
        // Get all the text within the chunk
        while (hret = S_OK) do
        begin
           c:=2048;
           hret:=pfilt.GetText(c, pwc);
           if (hret = S_OK) or (hret = FILTER_S_LAST_TEXT) then
           begin
              str:=WideCharLenToString(pwc, c);
              ShowMessage(str);
           end;
        end;
     end;

  finally

     // Release the interfaces
     ppfile:=nil;
     pfilt:=nil;

     // Free the allocated memory
     FreeMem(pwc);

  end;

0
 
LVL 1

Author Comment

by:delloro
ID: 8067603
Thanks for your comment,

I noticed that the Adobe DLL's ifilter method
was to be apartment threaded. I think that the
OFFFIlT dll is 'both' threaded.

Possibly, if I can find some more documentation
to assist you we might be able to solve this one
later.

I will leave the question open for the moment.

Thanks greatly for your time and support.
0
 
LVL 1

Author Comment

by:delloro
ID: 8067679
I have checked MSDN, apparently there are many
versions of OFFFILT.DLL depending on what version
of software you are using. eg. SQL Server etc.

I will check out some of these versions.

0
 
LVL 26

Expert Comment

by:Russell Libby
ID: 8067779

Sounds good...
(I still have no clue as to why the filter doesn't return any chunks.... :-) The whole thing makes you want to sit down and write your own filter, doesn't it?? :-)....

Russell
0
 

Expert Comment

by:CleanupPing
ID: 9316952
delloro:
This old question needs to be finalized -- accept an answer, split points, or get a refund.  For information on your options, please click here-> http:/help/closing.jsp#1 
EXPERTS:
Post your closing recommendations!  No comment means you don't care.
0
 
LVL 5

Expert Comment

by:snehanshu
ID: 10033411
Hi!
No comment has been added lately and this question is therefore classified abandoned.

If asker wishes to close the question, then refer to
http://www.experts-exchange.com/help/closing.jsp

Otherwise, I will leave a recommendation in the Cleanup topic area that this question is:

Answered by: rllibby

Please leave any comments here within the next seven days. It is assumed that any participant not responding to this request is no longer interested in its final disposition.

PLEASE DO NOT ACCEPT THIS COMMENT AS AN ANSWER!

...Snehanshu
EE Cleanup Volunteer
0

Featured Post

Technology Partners: We Want Your Opinion!

We value your feedback.

Take our survey and automatically be entered to win any one of the following:
Yeti Cooler, Amazon eGift Card, and Movie eGift Card!

Question has a verified solution.

If you are experiencing a similar issue, please ask a related question

Creating an auto free TStringList The TStringList is a basic and frequently used object in Delphi. On many occasions, you may want to create a temporary list, process some items in the list and be done with the list. In such cases, you have to…
Introduction Raise your hands if you were as upset with FireMonkey as I was when I discovered that there was no TListview.  I use TListView in almost all of my applications I've written, and I was not going to compromise by resorting to TStringGrid…
Michael from AdRem Software outlines event notifications and Automatic Corrective Actions in network monitoring. Automatic Corrective Actions are scripts, which can automatically run upon discovery of a certain undesirable condition in your network.…
This is my first video review of Microsoft Bookings, I will be doing a part two with a bit more information, but wanted to get this out to you folks.
Suggested Courses
Course of the Month10 days, 18 hours left to enroll

770 members asked questions and received personalized solutions in the past 7 days.

Join the community of 500,000 technology professionals and ask your questions.

Join & Ask a Question