Solved

Using IFilter With Delphi

Posted on 2002-04-25
15
2,325 Views
Last Modified: 2010-07-16
I'm working on a Delphi application which needs to be able to work with a COM DLL.  The one in particular is the Adobe IFilter (http://www.adobe.com/support/downloads/detail.jsp?ftpID=1276), which was written for Microsoft Indexing Service by Adobe.  The methods for this filter can be found on Microsoft's web site (http://msdn.microsoft.com/library/default.asp?url=/library/en-us/indexsrv/ixrefint_9sfm.asp).

I will provide the points to whoever can provide me with Delphi source code for using the IFilter.  I'm looking to extract text from PDF files.

It must be the IFilter because later on I will be using other IFilters as well.
0
Comment
Question by:dokken
15 Comments
 

Expert Comment

by:escaper
Comment Utility
listen
0
 
LVL 26

Expert Comment

by:Russell Libby
Comment Utility

Dokken,
Not sure if you have the header files (ntquery/filter/filtererr) so I am including them. I converted them from C by hand, and ntquery is not fully complete, but it has enough to cover what you're after.
At the bottom is a rough (little error checking, etc..) example of how to use the filter

-------------------------------------------------------
unit ntquery;

interface

uses
  Windows, SysUtils, ActiveX;

//
// Path of the "null" catalog, i.e. a catalog that has no index.
// Pass it when searching for properties of files that are not indexed.
//
const
  CINULLCATALOG     =  '::_noindex_::';

//
// Minimal support for persistent handlers.
//
// All of these are statically bound to query.dll, so a program that uses
// this unit will not start on a system where query.dll is missing.
//
// ppIUnk is untyped: pass an interface variable (e.g. an IUnknown) that
// receives the filter object when the call succeeds.
//
function   LoadIFilter(pwcsPath: PWideChar; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';
function   BindIFilterFromStorage(pStg: IStorage; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';
function   BindIFilterFromStream(pStm: IStream; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall; external 'query.dll';

//
// Catalog enumeration.
//
// pccMachine / pccCat are ULONG* in the C header: on input they hold the
// size of the matching buffer, on return the size used/required. They are
// therefore declared as var parameters (the earlier by-value declaration
// pushed the number itself where the DLL expects an address).
//
function   LocateCatalogsW(pwszScope: PWideChar; iBmk: ULONG; pwszMachine: PWideChar; var pccMachine: ULONG; pwszCat: PWideChar; var pccCat: ULONG): HResult; stdcall; external 'query.dll';

//
// ANSI flavour (the C header describes it as "for calling from VB").
// PAnsiChar keeps the declaration correct on compilers where PChar is wide.
//
function   LocateCatalogsA(pwszScope: PAnsiChar; iBmk: ULONG; pwszMachine: PAnsiChar; var pccMachine: ULONG; pwszCat: PAnsiChar; var pccCat: ULONG): HResult; stdcall; external 'query.dll';

//
// Well-known GUIDs
//
const
  // CLSID of the Index Server Data Source Object
  // NOTE(review): hand-converted value - confirm against your SDK's ntquery.h
  CLSID_INDEX_SERVER_DSO:       TGUID =  '{D7A2B01A-A47D-11D0-8C55-00C04FC2DB8D}';

  // Storage property set
  PSGUID_STORAGE:               TGUID =  '{B725F130-47EF-101A-A5F1-02608C9EEBAC}';

//
// Property ids inside the storage property set.
// Ids $00 (dictionary) and $01 (codepage) are reserved; $07 and $11 are
// not declared here.
//
const
  PID_STG_DIRECTORY             =  $02;
  PID_STG_CLASSID               =  $03;
  PID_STG_STORAGETYPE           =  $04;
  PID_STG_VOLUME_ID             =  $05;
  PID_STG_PARENT_WORKID         =  $06;
  PID_STG_FILEINDEX             =  $08;
  PID_STG_LASTCHANGEUSN         =  $09;
  PID_STG_NAME                  =  $0A;
  PID_STG_PATH                  =  $0B;
  PID_STG_SIZE                  =  $0C;
  PID_STG_ATTRIBUTES            =  $0D;
  PID_STG_WRITETIME             =  $0E;
  PID_STG_CREATETIME            =  $0F;
  PID_STG_ACCESSTIME            =  $10;
  PID_STG_ALLOCSIZE             =  $12;
  PID_STG_CONTENTS              =  $13;
  PID_STG_SHORTNAME             =  $14;
  PID_STG_MAX                   =  PID_STG_SHORTNAME;
  CSTORAGEPROPERTY              =  $15;

//
// File System Content Index Framework property set and its properties
//
const
  DBPROPSET_FSCIFRMWRK_EXT:     TGUID =  '{A9BD1526-6A80-11D0-8C9D-0020AF1D740E}';

const
  DBPROP_CI_CATALOG_NAME        =  2;
  DBPROP_CI_INCLUDE_SCOPES      =  3;
  DBPROP_CI_DEPTHS              =  4; // obsolete, same id as DBPROP_CI_SCOPE_FLAGS
  DBPROP_CI_SCOPE_FLAGS         =  4;
  DBPROP_CI_EXCLUDE_SCOPES      =  5;
  DBPROP_CI_SECURITY_ID         =  6;
  DBPROP_CI_QUERY_TYPE          =  7;

//
// Query Extension property set and its properties
//
const
  DBPROPSET_QUERYEXT:           TGUID =  '{A7AC77ED-F8D7-11CE-A798-0020F8008025}';

const
  DBPROP_USECONTENTINDEX        =  2;
  DBPROP_DEFERNONINDEXEDTRIMMING=  3;
  DBPROP_USEEXTENDEDDBTYPES     =  4;

//
// Content Index Framework Core property set and its properties
//
const
  DBPROPSET_CIFRMWRKCORE_EXT:   TGUID =  '{AFAFACA5-B5D1-11D0-8C62-00C04FC2DB8D}';

const
  DBPROP_MACHINE                =  2;
  DBPROP_CLIENT_CLSID           =  3;

//
// Scope flags
//
const
  QUERY_SHALLOW                 =  0;
  QUERY_DEEP                    =  1;
  QUERY_PHYSICAL_PATH           =  0;
  QUERY_VIRTUAL_PATH            =  2;

//
// Properties of the query property set (PSGUID_QUERY) that oledb.h does
// not define
//
const
  PROPID_QUERY_WORKID           =  5;
  PROPID_QUERY_UNFILTERED       =  7;
  PROPID_QUERY_VIRTUALPATH      =  9;
  PROPID_QUERY_LASTSEENTIME     =  10;

implementation

end.
-------------------------------------------------------
unit filter;

interface

uses
  Windows, SysUtils, Classes, ActiveX;

//
// Flags for the grfFlags argument of IFilter.Init (bit mask)
//
type
  IFILTER_INIT                        =  TOleEnum;
const
  IFILTER_INIT_CANON_PARAGRAPHS       =  1;
  IFILTER_INIT_HARD_LINE_BREAKS       =  2;
  IFILTER_INIT_CANON_HYPHENS          =  4;
  IFILTER_INIT_CANON_SPACES           =  8;
  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES =  16;
  IFILTER_INIT_APPLY_OTHER_ATTRIBUTES =  32;
  IFILTER_INIT_INDEXING_ONLY          =  64;
  IFILTER_INIT_SEARCH_LINKS           =  128;

//
// Flags declared alongside IFILTER_FLAGS
//
type
  IFILTER_FLAGS                       =  TOleEnum;
const
  IFILTER_FLAGS_OLE_PROPERTIES        =  1;

//
// Kind of content held by a chunk (STAT_CHUNK.flags)
//
type
  CHUNKSTATE                          =  TOleEnum;
const
  CHUNK_TEXT                          =  $01;
  CHUNK_VALUE                         =  $02;

//
// Break that precedes a chunk (STAT_CHUNK.breakType)
//
type
  CHUNK_BREAKTYPE                     =  TOleEnum;
const
  CHUNK_NO_BREAK                      =  0;
  CHUNK_EOW                           =  1;
  CHUNK_EOS                           =  2;
  CHUNK_EOP                           =  3;
  CHUNK_EOC                           =  4;

//
// The records below mirror the C structures field for field. They are
// deliberately NOT declared "packed": every field is naturally aligned, so
// on 32-bit targets the layout is identical either way, while on targets
// with 8-byte pointers "packed" would misplace the pointer inside PROPSPEC
// (and everything that embeds it).
//
type
  FILTERREGION      =  record
     idChunk:       ULONG;
     cwcStart:      ULONG;
     cwcExtent:     ULONG;
  end;
  tagFILTERREGION   =  FILTERREGION;

// Values for PROPSPEC.ulKind: selects which member of the variant part is valid
const
  PRSPEC_LPWSTR     =  0;
  PRSPEC_PROPID     =  1;

type
  PROPID            =  ULONG;

type
  PROPSPEC          =  record
     ulKind:        ULONG;
     case integer of
        0  :  (prid: PROPID);
        1  :  (lpws: PWideChar);
     end;
  tagPROPSPEC       =  PROPSPEC;

type
  FULLPROPSPEC      =  record
     guidPropSet:   TGUID;
     psProperty:    PROPSPEC;
  end;
  tagFULLPROPSPEC   =  FULLPROPSPEC;
  PFULLPROPSPEC     =  ^FULLPROPSPEC;

// Description of the current chunk, filled in by IFilter.GetChunk
type
  STAT_CHUNK        =  record
     idChunk:       ULONG;
     breakType:     CHUNK_BREAKTYPE;
     flags:         CHUNKSTATE;
     locale:        LCID;
     attribute:     FULLPROPSPEC;
     idChunkSource: ULONG;
     cwcStartSource:ULONG;
     cwcLenSource:  ULONG;
  end;
  tagSTAT_CHUNK     =  STAT_CHUNK;

// From filtererr.h
//
// Every code is declared with an explicit HResult() cast. Without the cast
// a value such as $80041701 is a positive number that never compares equal
// to the (negative) HResult a failing call returns.
//

//
// MessageId: FILTER_E_END_OF_CHUNKS
//
// MessageText:
//
//  No more chunks of text available in object.
//
const
  FILTER_E_END_OF_CHUNKS              =  HResult($80041700);

//
// MessageId: FILTER_E_NO_MORE_TEXT
//
// MessageText:
//
//  No more text available in chunk.
//
const
  FILTER_E_NO_MORE_TEXT               =  HResult($80041701);

//
// MessageId: FILTER_E_NO_MORE_VALUES
//
// MessageText:
//
//  No more property values available in chunk.
//
const
  FILTER_E_NO_MORE_VALUES             =  HResult($80041702);

//
// MessageId: FILTER_E_ACCESS
//
// MessageText:
//
//  Unable to access object.
//
const
  FILTER_E_ACCESS                     =  HResult($80041703);

//
// MessageId: FILTER_W_MONIKER_CLIPPED
//
// MessageText:
//
//  Moniker doesn't cover entire region.
//
// This is a warning, i.e. a success code: the severity bit is NOT set.
//
const
  FILTER_W_MONIKER_CLIPPED            =  HResult($00041704);

//
// MessageId: FILTER_E_NO_TEXT
//
// MessageText:
//
//  No text in current chunk.
//
const
  FILTER_E_NO_TEXT                    =  HResult($80041705);

//
// MessageId: FILTER_E_NO_VALUES
//
// MessageText:
//
//  No values in current chunk.
//
const
  FILTER_E_NO_VALUES                  =  HResult($80041706);

//
// MessageId: FILTER_E_EMBEDDING_UNAVAILABLE
//
// MessageText:
//
//  Unable to bind IFilter for embedded object.
//
const
  FILTER_E_EMBEDDING_UNAVAILABLE      =  HResult($80041707);

//
// MessageId: FILTER_E_LINK_UNAVAILABLE
//
// MessageText:
//
//  Unable to bind IFilter for linked object.
//
const
  FILTER_E_LINK_UNAVAILABLE           =  HResult($80041708);

//
// MessageId: FILTER_S_LAST_TEXT
//
// MessageText:
//
//  This is the last text in the current chunk.
//
const
  FILTER_S_LAST_TEXT                  =  HResult($00041709);

//
// MessageId: FILTER_S_LAST_VALUES
//
// MessageText:
//
//  This is the last value in the current chunk.
//
const
  FILTER_S_LAST_VALUES                =  HResult($0004170A);

//
// MessageId: FILTER_E_PASSWORD
//
// MessageText:
//
//  File was not filtered due to password protection.
//
const
  FILTER_E_PASSWORD                   =  HResult($8004170B);

//
// MessageId: FILTER_E_UNKNOWNFORMAT
//
// MessageText:
//
//  The document format is not recognized by the filter.
//
const
  FILTER_E_UNKNOWNFORMAT              =  HResult($8004170C);


const
  IID_IFilter:      TGUID =  '{89BCB740-6119-101A-BCB7-00DD010655AF}';

type
  //
  // Content filter interface: scans a document for text and properties.
  // Typical call order: Init, then GetChunk repeatedly, and for every chunk
  // GetText (text chunks) or GetValue (value chunks) until it is exhausted.
  //
  IFilter = interface(IUnknown)
     ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
     // grfFlags is a combination of IFILTER_INIT_* values; aAttributes points
     // to cAttributes FULLPROPSPEC entries (nil when cAttributes is 0).
     function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
     // Positions the filter on the next chunk and describes it in pStat.
     function GetChunk(out pStat: STAT_CHUNK): HResult; stdcall;
     // pcwcBuffer: in = capacity of awcBuffer in WideChars, out = number of
     // WideChars written.
     function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
     // The C prototype is PROPVARIANT**: the filter allocates the value and
     // hands back a pointer to it, hence the PPropVariant out parameter.
     // NOTE(review): per the IFilter documentation the caller releases the
     // value (PropVariantClear, then CoTaskMemFree) - confirm before relying on it.
     function GetValue(out ppPropValue: PPropVariant): HResult; stdcall;
     // riid is a REFIID in C, i.e. passed by reference; "const" makes Delphi
     // pass the address of the GUID rather than pushing its 16 bytes.
     function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
  end;

implementation

end.
-------------------------------------------------------

Example:

// Example: let query.dll locate the filter registered for a file, then read
// the text of every text chunk. Error handling is deliberately minimal: any
// step that does not return S_OK simply ends the procedure.
procedure TForm1.Button1Click(Sender: TObject);
var  punk:       IUnknown;
     pfilt:      IFilter;
     chunk:      STAT_CHUNK;
     pwc:        PWideChar;
     hret:       HResult;
     flags:      ULONG;
     c:          Cardinal;
begin

  // 4096 bytes = 2048 WideChars; holds the file name first and is then
  // reused as the text buffer
  pwc:=AllocMem(4096);
  try
     // Replace with your adobe file name
     StringToWideChar('c:\adobe\acrobat.pdf', pwc, 2048);
     if LoadIFilter(pwc, nil, punk) <> S_OK then Exit;
     if punk.QueryInterface(IFilter, pfilt) <> S_OK then Exit;
     if pfilt.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags) <> S_OK then Exit;

     // Walk all chunks; the first result other than S_OK ends the walk
     while pfilt.GetChunk(chunk) = S_OK do
     begin
        // Chunks without the CHUNK_TEXT flag carry no text for GetText
        if (chunk.flags and CHUNK_TEXT) = 0 then Continue;
        repeat
           // c must be reset on every call: in = buffer capacity in
           // WideChars, out = number of WideChars returned
           c:=2048;
           hret:=pfilt.GetText(c, pwc);
           // When the call succeeds pwc holds c WideChars of text;
           // consume them here
        until hret <> S_OK;
     end;
  finally
     // Runs on Exit and on exceptions alike; the interface variables are
     // released automatically when they go out of scope
     FreeMem(pwc);
  end;

end;

-------------------------------------------------------

Hope this helps get you started

Russell

0
 
LVL 9

Expert Comment

by:ginsonic
Comment Utility
interested
0
 
LVL 3

Expert Comment

by:VSF
Comment Utility
Cool!
0
 

Author Comment

by:dokken
Comment Utility
Russell,

Were you able to test this using the Adobe PDF IFilter?  I created a sample application and added the two units (ntquery and filter) and the code to the button event on the main form but when I change query.dll to the ifilter's path and file name (d:\pdffilt.dll) I get an access violation when the program loads.
0
 
LVL 26

Expert Comment

by:Russell Libby
Comment Utility

Dokken

1.) Yes I did test this.
2.) ??? Why are you changing query.dll ???

This is an MS DLL that exposes the LoadIFilter() function. This function is NOT exposed in the pdffilt.dll. The purpose of this is to allow you to pass a filename, and have MS do the work of finding the persistent handler associated with the file, then loading and returning the IFilter to you. The only thing that you would have needed to change in the code I gave you was the .pdf file that you wanted to filter on.

If I misunderstood what you changed, let me know (post the code) and we'll figure it out.

Russell
0
 

Author Comment

by:dokken
Comment Utility
Russell,

I misunderstood how you were doing this.  I was looking to call the DLL directly.  Will your way work under Win98?  Does it require anything, like IIS server?

Also, how can I view the text it extracts?
0
Do You Know the 4 Main Threat Actor Types?

Do you know the main threat actor types? Most attackers fall into one of four categories, each with their own favored tactics, techniques, and procedures.

 
LVL 26

Expert Comment

by:Russell Libby
Comment Utility

Sorry Dokken,

I gave you one way of doing it; which allows you to pass any file name, and it will find the associated filter for it. But, the query.dll is part of the MS indexing services, and requires NT/Win2K/WinXP. If you just want to deal with the PDF files (on all Win versions), then the following is what you want:

1.) Do not include the ntquery unit (because of its static bind to query.dll)

2.) Define the following const
// Class id handed to CoCreateInstance in the example below to create the
// PDF filter object directly
const
  PDFFilter:     TGUID = '{4C904448-74A9-11d0-AF6E-00C04FD8DC02}';

3.) Use the IPersistFile to load the PDF file:

Example:
-----------------------------------------------------------
// Example: create the PDF filter directly, load a file through IPersistFile
// and show all of the text it contains.
// Raises Exception (with the system message for the HRESULT) when creating,
// loading or initialising the filter does not return S_OK.
procedure TForm1.Button1Click(Sender: TObject);
var  pfilt:      IFilter;
     ppfile:     IPersistFile;
     chunk:      STAT_CHUNK;
     pwc:        PWideChar;
     hret:       HResult;
     flags:      ULONG;
     c:          Cardinal;
     str:        String;

  // Real runtime check. Assert is not used because assertions can be
  // compiled out, after which a failed call would go unnoticed and the
  // next line would dereference a nil interface.
  procedure Check(hr: HResult);
  begin
     if hr <> S_OK then raise Exception.Create(SysErrorMessage(hr));
  end;

begin

  // Get the class instance for the pdf filter
  Check(CoCreateInstance(PDFFilter, nil, CLSCTX_INPROC_SERVER or CLSCTX_INPROC_HANDLER, IFilter, pfilt));

  // Get the IPersistFile handler
  Check(pfilt.QueryInterface(IPersistFile, ppfile));

  // Allocate memory for wide char (4096 bytes = 2048 WideChars)
  pwc:=AllocMem(4096);

  // Try...Finally so we can clean up
  try

     // Load the file
     StringToWideChar('c:\adobe\acrobat.pdf', pwc, 2048);
     Check(ppfile.Load(pwc, STGM_READ));

     // Init the filter
     Check(pfilt.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags));

     // Walk every chunk; the first result other than S_OK ends the walk
     str:='';
     while pfilt.GetChunk(chunk) = S_OK do
     begin
        // Chunks without the CHUNK_TEXT flag carry no text for GetText
        if (chunk.flags and CHUNK_TEXT) = 0 then Continue;

        // Keep text of separate chunks apart unless the filter reports
        // that no break precedes this chunk
        if (str <> '') and (chunk.breakType <> CHUNK_NO_BREAK) then str:=str + #13#10;

        // Get all the text within the chunk
        repeat
           // in: buffer capacity in WideChars, out: WideChars returned
           c:=2048;
           hret:=pfilt.GetText(c, pwc);
           // FILTER_S_LAST_TEXT still delivers text, so it is collected too
           if (hret = S_OK) or (hret = FILTER_S_LAST_TEXT) then
              str:=str + WideCharLenToString(pwc, c);
        until hret <> S_OK;
     end;

     // Show everything once instead of one dialog per buffer
     ShowMessage(str);

  finally

     // Release the interfaces
     ppfile:=nil;
     pfilt:=nil;

     // Free the allocated memory
     FreeMem(pwc);

  end;

end;

-----------------------------------------------------------

Take note of the GetText function. On input, the cardinal is set to the size of the buffer in WideChars (note: size in WideChar = allocated buffer size div 2); on return, it holds the number of chars returned. Use WideCharLenToString to get the value as a string. What you do with the text (string) is up to you.

As far as a flow of operation goes :
1.) Get the interface
2.) Load the file
3.) Init the filter
4.) Call GetChunk()
5.) Call GetText until Chunk is exhausted
6.) Call GetChunk again, if no more chunks then done
7.) Clean up

The above example *should* (can't verify as I only have a 2000 box) work on Win95 on up.

Hope this clears it all up, if not, let me know :-)

Russell




0
 

Author Comment

by:dokken
Comment Utility
Russell,

I removed ntquery and added const PDFFilter:     TGUID = '{4C904448-74A9-11d0-AF6E-00C04FD8DC02}'; but I'm unable to compile the test app.  It gives "Undeclared identifier" errors at ppfile:     IPersistFile; and a few other locations.
0
 
LVL 26

Expert Comment

by:Russell Libby
Comment Utility

Dokken,

What version of Delphi are you compiling under? I just checked my D3 and D5 and both have IPersistFile defined in the ActiveX.pas unit. You should be including ActiveX as well as ComObj in this project. (As a general rule of thumb, most of your COM based projects will require the inclusion of these 2 units)

Russell
0
 

Author Comment

by:dokken
Comment Utility
Russell,

I'm using D5 so I must be doing something wrong.  How do I add the ActiveX and ComObj?
0
 
LVL 26

Accepted Solution

by:
Russell Libby earned 500 total points
Comment Utility

By placing the declarations in the "uses" section of your source file.

-----------------------------------------------------
unit {Whatever your unit name is};

interface

uses
   Windows, Messages, SysUtils, Classes, {Whatever is already there etc...}, ComObj, ActiveX;


0
 

Author Comment

by:dokken
Comment Utility
It works... thanks.  This solves a big problem.
0
 

Expert Comment

by:thbruns
Comment Utility
The link to the IFilter page at MSDN no longer exists. Could you please update this link?
Thanks a lot.
0
 
LVL 26

Expert Comment

by:EddieShipman
Comment Utility
This is the new link to the IFilter interface page:
http://msdn.microsoft.com/en-us/library/ms691105%28VS.85%29.aspx
0

Featured Post

IT, Stop Being Called Into Every Meeting

Highfive is so simple that setting up every meeting room takes just minutes and every employee will be able to start or join a call from any room with ease. Never be called into a meeting just to get it started again. This is how video conferencing should work!

Join & Write a Comment

Suggested Solutions

Title # Comments Views Activity
Display a PDF file and get words from it 6 96
Thread safe  opinion 7 110
Help on project with Soap 10 42
Delphi Form ownership 4 50
Introduction The parallel port is a very commonly known port, it was widely used to connect a printer to the PC, if you look at the back of your computer, for those who don't have newer computers, there will be a port with 25 pins and a small print…
In this tutorial I will show you how to use the Windows Speech API in Delphi. I will only cover basic functions such as text to speech and controlling the speed of the speech. SAPI Installation First you need to install the SAPI type library, th…
Internet Business Fax to Email Made Easy - With eFax Corporate (http://www.enterprise.efax.com), you'll receive a dedicated online fax number, which is used the same way as a typical analog fax number. You'll receive secure faxes in your email, fr…
Access reports are powerful and flexible. Learn how to create a query and then a grouped report using the wizard. Modify the report design after the wizard is done to make it look better. There will be another video to explain how to put the final p…

744 members asked questions and received personalized solutions in the past 7 days.

Join the community of 500,000 technology professionals and ask your questions.

Join & Ask a Question

Need Help in Real-Time?

Connect with top rated Experts

15 Experts available now in Live!

Get 1:1 Help Now