Link to home
Start Free TrialLog in
Avatar of dokken
dokken

asked on

Using IFilter With Delphi

I'm working on a Delphi application which needs to be able to work with a COM DLL.  The one in particular is the Adobe IFilter (http://www.adobe.com/support/downloads/detail.jsp?ftpID=1276), which was written by Adobe for Microsoft Indexing Service.  The methods for this filter can be found on Microsoft's web site (http://msdn.microsoft.com/library/default.asp?url=/library/en-us/indexsrv/ixrefint_9sfm.asp).

I will provide the points to whoever can provide me with Delphi source code for using the IFilter.  I'm looking to extract text from PDF files.

It must be the IFilter because later on I will be using other IFilters as well.
Avatar of escaper
escaper

listen
Avatar of Russell Libby

Dokken,
Not sure if you have the header files (ntquery/filter/filtererr), so I am including them. I converted them from C by hand, and ntquery is not fully complete, but it has enough to cover what you're after.
At the bottom is a rough (little error checking, etc..) example of how to use the filter

-------------------------------------------------------
unit ntquery;

//
// Partial hand translation of the Indexing Service header ntquery.h.
// It is not complete: only the IFilter loading helpers, the catalog
// lookup functions and the property-set constants below are covered.
//
// NOTE: every function here is bound statically to query.dll, so a program
// that uses this unit will not start on a system where query.dll is missing.
//

interface

uses
  Windows, SysUtils, ActiveX;

//
// Use this path for the null catalog, one that doesn't have an index.
// Use it to search for properties of files that are not indexed.
//
const
  CINULLCATALOG     =  '::_noindex_::';

//
// Minimal support for persistent handlers.
//
// LoadIFilter takes a file name, locates the persistent handler registered
// for that file and returns the filter object in ppIUnk. ppIUnk is untyped
// (C: void **); pass an interface variable (IUnknown or IFilter) that is nil
// on entry. The two BindIFilterFrom* functions do the same for an already
// open storage / stream.
// pUnkOuter is presumably the controlling unknown for aggregation; pass nil
// when not aggregating -- TODO confirm against the SDK documentation.
//
function   LoadIFilter(pwcsPath: PWideChar; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';
function   BindIFilterFromStorage(pStg: IStorage; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';
function   BindIFilterFromStream(pStm: IStream; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';

//
// Catalog lookup. pccMachine and pccCat are ULONG * in ntquery.h, so they are
// declared as var parameters here: passing them by value would hand the DLL
// a number where it expects an address.
// Presumably they carry the buffer sizes (in characters) on input and the
// lengths written on output -- TODO confirm against the SDK documentation.
//
function   LocateCatalogsW(pwszScope: PWideChar; iBmk: ULONG; pwszMachine: PWideChar; var pccMachine: ULONG; pwszCat: PWideChar; var pccCat: ULONG): HResult; stdcall; external 'query.dll';

//
// For calling from VB
//
// ANSI variant. PAnsiChar is used explicitly because PChar is a wide
// character pointer on Unicode versions of Delphi.
//
function   LocateCatalogsA(pwszScope: PAnsiChar; iBmk: ULONG; pwszMachine: PAnsiChar; var pccMachine: ULONG; pwszCat: PAnsiChar; var pccCat: ULONG): HResult; stdcall; external 'query.dll';

// The Index Server Data Source Object CLSID
const
  CLSID_INDEX_SERVER_DSO: TGUID =  '{D7A2B01A-A47D-11D0-8C55-00C04FC2DB8D}';

// The storage property set
const
  PSGUID_STORAGE:         TGUID =  '{B725F130-47EF-101A-A5F1-02608C9EEBAC}';

// Property ids within the storage property set.
//#define PID_STG_DICTIONARY            ((PROPID) 0x00000000) //reserved
//#define PID_STG_CODEPAGE              ((PROPID) 0x00000001) //reserved
const
  PID_STG_DIRECTORY          =  $00000002;
  PID_STG_CLASSID            =  $00000003;
  PID_STG_STORAGETYPE        =  $00000004;
  PID_STG_VOLUME_ID          =  $00000005;
  PID_STG_PARENT_WORKID      =  $00000006;
  // unused #define PID_STG_              ((PROPID) 0x00000007)
  PID_STG_FILEINDEX          =  $00000008;
  PID_STG_LASTCHANGEUSN      =  $00000009;
  PID_STG_NAME               =  $0000000A;
  PID_STG_PATH               =  $0000000B;
  PID_STG_SIZE               =  $0000000C;
  PID_STG_ATTRIBUTES         =  $0000000D;
  PID_STG_WRITETIME          =  $0000000E;
  PID_STG_CREATETIME         =  $0000000F;
  PID_STG_ACCESSTIME         =  $00000010;
  // unused #define PID_STG_              ((PROPID) 0x00000011)
  PID_STG_ALLOCSIZE          =  $00000012;
  PID_STG_CONTENTS           =  $00000013;
  PID_STG_SHORTNAME          =  $00000014;
  PID_STG_MAX                =  PID_STG_SHORTNAME;
  // One more than the highest property id above.
  CSTORAGEPROPERTY           =  $15;

// File System Content Index Framework property set
const
  DBPROPSET_FSCIFRMWRK_EXT:  TGUID =  '{A9BD1526-6A80-11D0-8C9D-0020AF1D740E}';

const
  DBPROP_CI_CATALOG_NAME     =  2;
  DBPROP_CI_INCLUDE_SCOPES   =  3;
  DBPROP_CI_DEPTHS           =  4; // obsolete
  DBPROP_CI_SCOPE_FLAGS      =  4; // same id as the obsolete DBPROP_CI_DEPTHS
  DBPROP_CI_EXCLUDE_SCOPES   =  5;
  DBPROP_CI_SECURITY_ID      =  6;
  DBPROP_CI_QUERY_TYPE       =  7;

// Query Extension property set
const
  DBPROPSET_QUERYEXT:        TGUID =  '{A7AC77ED-F8D7-11CE-A798-0020F8008025}';

const
  DBPROP_USECONTENTINDEX        =  2;
  DBPROP_DEFERNONINDEXEDTRIMMING=  3;
  DBPROP_USEEXTENDEDDBTYPES     =  4;

// Content Index Framework Core property set
const
  DBPROPSET_CIFRMWRKCORE_EXT:   TGUID =  '{AFAFACA5-B5D1-11D0-8C62-00C04FC2DB8D}';

const
  DBPROP_MACHINE                =  2;
  DBPROP_CLIENT_CLSID           =  3;

// Scope flags
const
  QUERY_SHALLOW                 =  0;
  QUERY_DEEP                    =  1;
  QUERY_PHYSICAL_PATH           =  0;
  QUERY_VIRTUAL_PATH            =  2;

// query property set (PSGUID_QUERY) properties not defined in oledb.h
const
  PROPID_QUERY_WORKID           =  5;
  PROPID_QUERY_UNFILTERED       =  7;
  PROPID_QUERY_VIRTUALPATH      =  9;
  PROPID_QUERY_LASTSEENTIME     =  10;

implementation

end.
-------------------------------------------------------
unit filter;

//
// Hand translation of the IFilter declarations (filter.h) and the FILTER_*
// status codes (filterr.h).
//

interface

uses
  Windows, SysUtils, Classes, ActiveX;

// Flags for the grfFlags parameter of IFilter.Init (bit values, may be or-ed).
type
  IFILTER_INIT                        =  TOleEnum;
const
  IFILTER_INIT_CANON_PARAGRAPHS       =  1;
  IFILTER_INIT_HARD_LINE_BREAKS       =  2;
  IFILTER_INIT_CANON_HYPHENS          =  4;
  IFILTER_INIT_CANON_SPACES           =  8;
  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES =  16;
  IFILTER_INIT_APPLY_OTHER_ATTRIBUTES =  32;
  IFILTER_INIT_INDEXING_ONLY          =  64;
  IFILTER_INIT_SEARCH_LINKS           =  128;

// Flags returned by IFilter.Init in pFlags.
type
  IFILTER_FLAGS                       =  TOleEnum;
const
  IFILTER_FLAGS_OLE_PROPERTIES        =  1;

// Values for STAT_CHUNK.flags.
type
  CHUNKSTATE                          =  TOleEnum;
const
  CHUNK_TEXT                          =  $01;
  CHUNK_VALUE                         =  $02;

// Values for STAT_CHUNK.breakType.
type
  CHUNK_BREAKTYPE                     =  TOleEnum;
const
  CHUNK_NO_BREAK                      =  0;
  CHUNK_EOW                           =  1;
  CHUNK_EOS                           =  2;
  CHUNK_EOP                           =  3;
  CHUNK_EOC                           =  4;

//
// The records below are deliberately NOT declared "packed". The C headers use
// the default structure packing; on 32-bit targets both forms give the same
// layout, but on 64-bit targets "packed" would drop the padding that C places
// in front of the pointer inside PROPSPEC.
//
type
  FILTERREGION      =  record
     idChunk:       ULONG;
     cwcStart:      ULONG;
     cwcExtent:     ULONG;
  end;
  tagFILTERREGION   =  FILTERREGION;


// Values for PROPSPEC.ulKind: selects which member of the variant part is valid.
const
  PRSPEC_LPWSTR     =  0;
  PRSPEC_PROPID     =  1;

type
  PROPID            =  ULONG;

type
  PROPSPEC          =  record
     ulKind:        ULONG;
     case integer of
        0  :  (prid: PROPID);
        1  :  (lpws: PWideChar);
     end;
  tagPROPSPEC       =  PROPSPEC;

type
  FULLPROPSPEC      =  record
     guidPropSet:   TGUID;
     psProperty:    PROPSPEC;
  end;
  tagFULLPROPSPEC   =  FULLPROPSPEC;
  PFULLPROPSPEC     =  ^FULLPROPSPEC;

// Chunk description filled in by IFilter.GetChunk.
type
  STAT_CHUNK        =  record
     idChunk:       ULONG;
     breakType:     CHUNK_BREAKTYPE;
     flags:         CHUNKSTATE;
     locale:        LCID;
     attribute:     FULLPROPSPEC;
     idChunkSource: ULONG;
     cwcStartSource:ULONG;
     cwcLenSource:  ULONG;
  end;
  tagSTAT_CHUNK     =  STAT_CHUNK;

//
// From filtererr.h
//
// Every code is cast to HResult. An untyped $8004xxxx literal is a positive
// number in Delphi and would never compare equal to the (signed) HResult
// returned by an IFilter method.
//
const
  // No more chunks of text available in object.
  FILTER_E_END_OF_CHUNKS              =  HResult($80041700);
  // No more text available in chunk.
  FILTER_E_NO_MORE_TEXT               =  HResult($80041701);
  // No more property values available in chunk.
  FILTER_E_NO_MORE_VALUES             =  HResult($80041702);
  // Unable to access object.
  FILTER_E_ACCESS                     =  HResult($80041703);
  // Moniker doesn't cover entire region.
  // This is a warning (FILTER_W_), i.e. a success code: the failure bit is clear.
  FILTER_W_MONIKER_CLIPPED            =  HResult($00041704);
  // No text in current chunk.
  FILTER_E_NO_TEXT                    =  HResult($80041705);
  // No values in current chunk.
  FILTER_E_NO_VALUES                  =  HResult($80041706);
  // Unable to bind IFilter for embedded object.
  FILTER_E_EMBEDDING_UNAVAILABLE      =  HResult($80041707);
  // Unable to bind IFilter for linked object.
  FILTER_E_LINK_UNAVAILABLE           =  HResult($80041708);
  // This is the last text in the current chunk.
  FILTER_S_LAST_TEXT                  =  HResult($00041709);
  // This is the last value in the current chunk.
  FILTER_S_LAST_VALUES                =  HResult($0004170A);
  // File was not filtered due to password protection.
  FILTER_E_PASSWORD                   =  HResult($8004170B);
  // The document format is not recognized by the filter.
  FILTER_E_UNKNOWNFORMAT              =  HResult($8004170C);


const
  IID_IFIlter:      TGUID =  '{89BCB740-6119-101A-BCB7-00DD010655AF}';

type
  //
  // Text filter interface. Typical call order: Init, then GetChunk in a loop,
  // and for every chunk GetText (or GetValue) until the chunk is exhausted.
  //
  IFilter = interface(IUnknown)
     ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
     // grfFlags: combination of IFILTER_INIT_* values. aAttributes points to
     // cAttributes FULLPROPSPEC entries (nil when cAttributes is 0).
     // pFlags receives IFILTER_FLAGS_* values.
     function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
     // Moves to the next chunk and describes it in pStat.
     function GetChunk(out pStat: STAT_CHUNK): HResult; stdcall;
     // pcwcBuffer: on input the buffer size in WideChars, on return the number
     // of WideChars stored in awcBuffer.
     function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
     // The COM signature is PROPVARIANT **: the filter returns a pointer to a
     // PROPVARIANT that it allocated. Per the IFilter documentation the caller
     // releases it (PropVariantClear followed by CoTaskMemFree).
     function GetValue(out ppPropValue: PPropVariant): HResult; stdcall;
     // riid is REFIID in the COM signature, so it is passed by reference
     // (const); origPos is passed by value, as in the C declaration.
     function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
  end;

implementation

end.
-------------------------------------------------------

Example:

// Example: extract the text of a document through the filter that
// LoadIFilter (query.dll) finds for the file, and show it in a dialog.
// Any failure while loading or initialising the filter is ignored silently:
// nothing is shown in that case.
procedure TForm1.Button1Click(Sender: TObject);
var  punk:       IUnknown;
     pfilt:      IFilter;
     chunk:      STAT_CHUNK;
     pwc:        PWideChar;
     hret:       HResult;
     flags:      ULONG;
     c:          Cardinal;
     str:        String;
begin

  // 4096 bytes = 2048 WideChars. The buffer first holds the file name and is
  // then reused as the text buffer for GetText.
  pwc:=AllocMem(4096);
  try
     // Replace with your adobe file name
     StringToWideChar('c:\adobe\acrobat.pdf', pwc, 2048);
     if LoadIFilter(pwc, nil, punk) = S_OK then
     begin
        if punk.QueryInterface(IFilter, pfilt) = S_OK then
        begin
           if pfilt.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags) = S_OK then
           begin
              str:='';
              // Walk over every chunk, not just the first one. The loop stops
              // at the first GetChunk result other than S_OK (normally "end
              // of chunks").
              while pfilt.GetChunk(chunk) = S_OK do
              begin
                 // Keep words of adjacent chunks apart unless the filter says
                 // that there is no break between them.
                 if (chunk.breakType <> CHUNK_NO_BREAK) and (str <> '') then
                    str:=str+' ';
                 repeat
                    // c is in/out: buffer size in WideChars going in, number
                    // of WideChars returned coming out.
                    c:=2048;
                    hret:=pfilt.GetText(c, pwc);
                    // FILTER_S_LAST_TEXT still delivers text; it only
                    // announces that the chunk is exhausted afterwards.
                    if (hret = S_OK) or (hret = FILTER_S_LAST_TEXT) then
                       str:=str+WideCharLenToString(pwc, c);
                 until hret <> S_OK;
              end;
              ShowMessage(str);
           end;
        end;
     end;
  finally
     // Release the interfaces before the buffer, also when an exception
     // passes through.
     pfilt:=nil;
     punk:=nil;
     FreeMem(pwc);
  end;

end;

-------------------------------------------------------

Hope this helps get you started

Russell

interested
Cool!
Avatar of dokken

ASKER

Russell,

Were you able to test this using the Adobe PDF IFilter?  I created a sample application and added the two units (ntquery and filter) and the code to the button event on the main form but when I change query.dll to the ifilter's path and file name (d:\pdffilt.dll) I get an access violation when the program loads.

Dokken

1.) Yes I did test this.
2.) ??? Why are you changing query.dll ???

This is an MS DLL that exposes the LoadIFilter() function. This function is NOT exposed in the pdffilt.dll. The purpose of this is to allow you to pass a filename, and have MS do the work of finding the persistent handler associated with the file, then loading and returning the IFilter to you. The only thing that you would have needed to change in the code I gave you was the .pdf file that you wanted to filter on.

If I misunderstood what you changed, let me know (post the code) and we'll figure it out.

Russell
Avatar of dokken

ASKER

Russell,

I misunderstood how you were doing this.  I was looking to call the DLL directly.  Will your way work under Win98?  Does it require anything, like IIS server?

Also, how can I view the text it extracts?

Sorry Dokken,

I gave you one way of doing it; which allows you to pass any file name, and it will find the associated filter for it. But, the query.dll is part of the MS indexing services, and requires NT/Win2K/WinXP. If you just want to deal with the PDF files (on all Win versions), then the following is what you want:

1.) Do not include the ntquery unit (because of its static binding to query.dll)

2.) Define the following const
// Class id passed to CoCreateInstance in the example below to create the PDF
// filter object directly, without going through query.dll.
// NOTE(review): presumably the CLSID registered by the Adobe PDF IFilter
// setup -- confirm against the registry of the target machine.
const
  PDFFilter:     TGUID = '{4C904448-74A9-11d0-AF6E-00C04FD8DC02}';

3.) Use the IPersistFile to load the PDF file:

Example:
-----------------------------------------------------------
// Example: create the PDF filter directly through COM, load a file via
// IPersistFile, read the text of all chunks and show it in one dialog.
// Raises Exception (with the system message for the HRESULT) when a step
// fails.
procedure TForm1.Button1Click(Sender: TObject);
const
  // FILTER_E_END_OF_CHUNKS from filterr.h. Declared locally and cast to
  // HResult so that it compares equal to the value returned by GetChunk.
  END_OF_CHUNKS  =  HResult($80041700);
var  pfilt:      IFilter;
     ppfile:     IPersistFile;
     chunk:      STAT_CHUNK;
     pwc:        PWideChar;
     hret:       HResult;
     flags:      ULONG;
     c:          Cardinal;
     str:        String;

  // Raises when hr is not S_OK. Unlike Assert this is not removed when the
  // project is compiled with assertions switched off.
  procedure Check(hr: HResult);
  begin
     if hr <> S_OK then raise Exception.Create(SysErrorMessage(hr));
  end;

begin

  // Get the class instance for the pdf filter
  Check(CoCreateInstance(PDFFilter, nil, CLSCTX_INPROC_SERVER or CLSCTX_INPROC_HANDLER, IFilter, pfilt));

  // Get the IPersistFile handler
  Check(pfilt.QueryInterface(IPersistFile, ppfile));

  // Allocate memory for wide char (4096 bytes = 2048 WideChars)
  pwc:=AllocMem(4096);

  // Try...Finally so we can clean up
  try

     // Load the file
     StringToWideChar('c:\adobe\acrobat.pdf', pwc, 2048);
     Check(ppfile.Load(pwc, STGM_READ));

     // Init the filter
     Check(pfilt.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags));

     // Walk over all chunks; the file name buffer is reused for the text
     str:='';
     hret:=pfilt.GetChunk(chunk);
     while hret = S_OK do
     begin
        // Keep words of adjacent chunks apart unless the filter says that
        // there is no break between them
        if (chunk.breakType <> CHUNK_NO_BREAK) and (str <> '') then
           str:=str+' ';

        // Get all the text within the chunk
        repeat
           // c is in/out: buffer size in WideChars going in, number of
           // WideChars returned coming out
           c:=2048;
           hret:=pfilt.GetText(c, pwc);
           if (hret = S_OK) or (hret = FILTER_S_LAST_TEXT) then
              str:=str+WideCharLenToString(pwc, c);
        until hret <> S_OK;

        // Next chunk
        hret:=pfilt.GetChunk(chunk);
     end;

     // Anything but "end of chunks" means the document was not read
     // completely (password protection, access error, ...)
     if hret <> END_OF_CHUNKS then Check(hret);

     // One dialog for the whole document instead of one per 2048 characters
     ShowMessage(str);

  finally

     // Release the interfaces
     ppfile:=nil;
     pfilt:=nil;

     // Free the allocated memory
     FreeMem(pwc);

  end;

end;

-----------------------------------------------------------

Pay note to the GetText function. The cardinal is set to the size of the buffer (note: size in WideChar = allocated buffer size div 2) on input, and on return will hold the number of chars returned. Use WideCharLenToString to get the value as string. What you do with the text (string) is up to you.

As far as a flow of operation goes :
1.) Get the interface
2.) Load the file
3.) Init the filter
4.) Call GetChunk()
5.) Call GetText until Chunk is exhausted
6.) Call GetChunk again, if no more chunks then done
7.) Clean up

The above example *should* (can't verify as I only have a 2000 box) work on Win95 on up.

Hope this clears it all up, if not, let me know :-)

Russell




Avatar of dokken

ASKER

Russell,

I removed ntquery and added const PDFFilter:     TGUID = '{4C904448-74A9-11d0-AF6E-00C04FD8DC02}'; but I'm unable to compile the test app.  It gives "Undeclared identifier" errors at ppfile:     IPersistFile; and a few other locations.

Dokken,

What version of Delphi are you compiling under? I just checked my D3 and D5 and both have IPersistFile defined in the ActiveX.pas unit. You should be including ActiveX as well as ComObj in this project. (As a general rule of thumb, most of your COM-based projects will require the inclusion of these 2 units.)

Russell
Avatar of dokken

ASKER

Russell,

I'm using D5 so I must be doing something wrong.  How do I add the ActiveX and ComObj?
ASKER CERTIFIED SOLUTION
Avatar of Russell Libby
Russell Libby
Flag of United States of America image

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
Avatar of dokken

ASKER

It works... thanks.  This solves a big problem.
The link to the IFilter page at MSDN no longer exists. Could you please update this link?
Thanks a lot.
This is the new link to the IFilter interface page:
http://msdn.microsoft.com/en-us/library/ms691105%28VS.85%29.aspx