dokken
asked on
Using IFilter With Delphi
I'm working on a Delphi application which needs to be able to work with a COM DLL. The one in particular is the Adobe IFilter (http://www.adobe.com/support/downloads/detail.jsp?ftpID=1276), which was written by Adobe for the Microsoft Indexing Service. The methods for this filter can be found on Microsoft's web site (http://msdn.microsoft.com/library/default.asp?url=/library/en-us/indexsrv/ixrefint_9sfm.asp).
I will provide the points to whoever can provide me with Delphi source code for using the IFilter. I'm looking to extract text from PDF files.
It must be the IFilter because later on I will be using other IFilters as well.
I will provide the points to whoever can provide me with Delphi source code for using the IFilter. I'm looking to extract text from PDF files.
It must be the IFilter because later on I will be using other IFilters as well.
listen
Dokken,
Not sure if you have the header files (ntquery/filter/filtererr)
At the bottom is a rough (little error checking, etc..) example of how to use the filter
--------------------------
// Unit ntquery: constants plus functions imported from query.dll.
//
// NOTE(review): several declarations below end in the middle of a token or
// string literal in this copy of the listing (they are flagged individually).
// The unit cannot compile until those lines are restored from the original
// header.
unit ntquery;
interface
uses
Windows, SysUtils, ActiveX;
//
// Use this path for the null catalog, one that doesn't have an index.
// Use it to search for properties of files that are not indexed.
//
const
CINULLCATALOG = '::_noindex_::';
//
// Minimal support for persistent handlers.
//
// LoadIFilter is bound statically to query.dll (stdcall). pwcsPath is the
// wide-character path of the file to filter; the resulting interface is
// stored in the untyped var parameter ppIUnk.
function LoadIFilter(pwcsPath: PWideChar; pUnkOuter: IUnknown; var ppIUnk): HResult; stdcall external 'query.dll';
// NOTE(review): the next three declarations are cut off after the first
// parameter name -- restore the full signatures before use.
function BindIFilterFromStorage(pSt
function BindIFilterFromStream(pStm
function LocateCatalogsW(pwszScope:
//
// For calling from VB
//
// NOTE(review): declaration cut off -- restore the full signature.
function LocateCatalogsA(pwszScope:
// The Index Server Data Source Object CLSID
// NOTE(review): GUID literal is cut off (no closing brace or quote).
const
CLSID_INDEX_SERVER_DSO: TGUID = '{D7A2B01A-A47D-11D0-8C55-
// The storage property set
// NOTE(review): GUID literal is cut off (no closing brace or quote).
const
PSGUID_STORAGE: TGUID = '{B725F130-47EF-101A-A5F1-
//#define PID_STG_DICTIONARY ((PROPID) 0x00000000) //reserved
//#define PID_STG_CODEPAGE ((PROPID) 0x00000001) //reserved
// Property ids within the storage property set. Ids $07 and $11 are unused.
const
PID_STG_DIRECTORY = $00000002;
PID_STG_CLASSID = $00000003;
PID_STG_STORAGETYPE = $00000004;
PID_STG_VOLUME_ID = $00000005;
PID_STG_PARENT_WORKID = $00000006;
// unused #define PID_STG_ ((PROPID) 0x00000007)
PID_STG_FILEINDEX = $00000008;
PID_STG_LASTCHANGEUSN = $00000009;
PID_STG_NAME = $0000000A;
PID_STG_PATH = $0000000B;
PID_STG_SIZE = $0000000C;
PID_STG_ATTRIBUTES = $0000000D;
PID_STG_WRITETIME = $0000000E;
PID_STG_CREATETIME = $0000000F;
PID_STG_ACCESSTIME = $00000010;
// unused #define PID_STG_ ((PROPID) 0x00000011)
PID_STG_ALLOCSIZE = $00000012;
PID_STG_CONTENTS = $00000013;
PID_STG_SHORTNAME = $00000014;
// Alias for the highest id defined above.
PID_STG_MAX = PID_STG_SHORTNAME;
// $15 is one more than PID_STG_MAX ($14).
CSTORAGEPROPERTY = $15;
// File System Content Index Framework property set
// NOTE(review): GUID literal is cut off (no closing brace or quote).
const
DBPROPSET_FSCIFRMWRK_EXT: TGUID = '{A9BD1526-6A80-11D0-8C9D-
const
DBPROP_CI_CATALOG_NAME = 2;
DBPROP_CI_INCLUDE_SCOPES = 3;
// DBPROP_CI_DEPTHS and DBPROP_CI_SCOPE_FLAGS intentionally share the value 4;
// the former is marked obsolete.
DBPROP_CI_DEPTHS = 4; // obsolete
DBPROP_CI_SCOPE_FLAGS = 4;
DBPROP_CI_EXCLUDE_SCOPES = 5;
DBPROP_CI_SECURITY_ID = 6;
DBPROP_CI_QUERY_TYPE = 7;
// Query Extension property set
// NOTE(review): GUID literal is cut off (no closing brace or quote).
const
DBPROPSET_QUERYEXT: TGUID = '{A7AC77ED-F8D7-11CE-A798-
const
DBPROP_USECONTENTINDEX = 2;
// NOTE(review): identifier and value are cut off on the next line; going by
// its neighbours (2 and 4) the value is presumably 3 -- confirm against the
// original header.
DBPROP_DEFERNONINDEXEDTRIM
DBPROP_USEEXTENDEDDBTYPES = 4;
// Content Index Framework Core property set
// NOTE(review): the declaration below is cut off after the identifier.
const
DBPROPSET_CIFRMWRKCORE_EXT
const
DBPROP_MACHINE = 2;
DBPROP_CLIENT_CLSID = 3;
// Scope flags
const
QUERY_SHALLOW = 0;
QUERY_DEEP = 1;
QUERY_PHYSICAL_PATH = 0;
QUERY_VIRTUAL_PATH = 2;
// query property set (PSGUID_QUERY) properties not defined in oledb.h
const
PROPID_QUERY_WORKID = 5;
PROPID_QUERY_UNFILTERED = 7;
PROPID_QUERY_VIRTUALPATH = 9;
PROPID_QUERY_LASTSEENTIME = 10;
implementation
end.
--------------------------
unit filter;
interface
uses
Windows, SysUtils, Classes, ActiveX;
type
  // Flags for the grfFlags argument of IFilter.Init; combine with "or".
  IFILTER_INIT = TOleEnum;
const
  IFILTER_INIT_CANON_PARAGRAPHS       = 1;
  IFILTER_INIT_HARD_LINE_BREAKS       = 2;
  IFILTER_INIT_CANON_HYPHENS          = 4;
  IFILTER_INIT_CANON_SPACES           = 8;
  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16;
  IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32;
  IFILTER_INIT_INDEXING_ONLY          = 64;
  IFILTER_INIT_SEARCH_LINKS           = 128;

type
  // Flags returned through the pFlags argument of IFilter.Init.
  IFILTER_FLAGS = TOleEnum;
const
  IFILTER_FLAGS_OLE_PROPERTIES = 1;

type
  // Bit flags stored in STAT_CHUNK.flags: the kind of data a chunk carries.
  CHUNKSTATE = TOleEnum;
const
  CHUNK_TEXT  = $01;  // read the chunk with IFilter.GetText
  CHUNK_VALUE = $02;  // read the chunk with IFilter.GetValue

type
  // Kind of break that separates a chunk from the one before it.
  CHUNK_BREAKTYPE = TOleEnum;
const
  CHUNK_NO_BREAK = 0;
  CHUNK_EOW      = 1;  // end of word
  CHUNK_EOS      = 2;  // end of sentence
  CHUNK_EOP      = 3;  // end of paragraph
  CHUNK_EOC      = 4;  // end of chapter

// The records below are deliberately NOT declared "packed": every field is
// naturally aligned, so the 32-bit layout is identical either way, while on a
// 64-bit target natural alignment is what matches the C declarations (the
// pointer inside PROPSPEC must sit on an 8-byte boundary).
type
  // Identifies a span of characters inside one chunk.
  FILTERREGION = record
    idChunk: ULONG;
    cwcStart: ULONG;
    cwcExtent: ULONG;
  end;
  tagFILTERREGION = FILTERREGION;

const
  // Values for PROPSPEC.ulKind: which member of the variant part is valid.
  PRSPEC_LPWSTR = 0;
  PRSPEC_PROPID = 1;

type
  PROPID = ULONG;

type
  // A property named either by numeric id or by string, selected by ulKind.
  PROPSPEC = record
    ulKind: ULONG;
    case integer of
      0 : (prid: PROPID);      // valid when ulKind = PRSPEC_PROPID
      1 : (lpws: PWideChar);   // valid when ulKind = PRSPEC_LPWSTR
  end;
  tagPROPSPEC = PROPSPEC;

type
  // A property qualified by the GUID of the property set it belongs to.
  FULLPROPSPEC = record
    guidPropSet: TGUID;
    psProperty: PROPSPEC;
  end;
  tagFULLPROPSPEC = FULLPROPSPEC;
  PFULLPROPSPEC = ^FULLPROPSPEC;

type
  // Description of the current chunk, filled in by IFilter.GetChunk.
  STAT_CHUNK = record
    idChunk: ULONG;
    breakType: CHUNK_BREAKTYPE;
    flags: CHUNKSTATE;
    locale: LCID;
    attribute: FULLPROPSPEC;
    idChunkSource: ULONG;
    cwcStartSource: ULONG;
    cwcLenSource: ULONG;
  end;
  tagSTAT_CHUNK = STAT_CHUNK;
// From filterr.h: status codes returned by the IFilter methods.
//
// Every value is declared through HResult(...) so that the constant has the
// same signed 32-bit type as the value the methods return. A bare literal
// such as $80041701 is an unsigned/64-bit constant in Delphi and therefore
// never compares equal to a (negative) HResult variable.
const
  // No more chunks of text available in object.
  FILTER_E_END_OF_CHUNKS = HResult($80041700);

  // No more text available in chunk.
  FILTER_E_NO_MORE_TEXT = HResult($80041701);

  // No more property values available in chunk.
  FILTER_E_NO_MORE_VALUES = HResult($80041702);

  // Unable to access object.
  FILTER_E_ACCESS = HResult($80041703);

  // Moniker doesn't cover entire region.
  // This is a warning: the severity bit is clear, so Succeeded() is True.
  FILTER_W_MONIKER_CLIPPED = HResult($00041704);

  // No text in current chunk.
  FILTER_E_NO_TEXT = HResult($80041705);

  // No values in current chunk.
  FILTER_E_NO_VALUES = HResult($80041706);

  // Unable to bind IFilter for embedded object.
  FILTER_E_EMBEDDING_UNAVAILABLE = HResult($80041707);

  // Unable to bind IFilter for linked object.
  FILTER_E_LINK_UNAVAILABLE = HResult($80041708);

  // This is the last text in the current chunk.
  FILTER_S_LAST_TEXT = HResult($00041709);

  // This is the last value in the current chunk.
  FILTER_S_LAST_VALUES = HResult($0004170A);

  // File was not filtered due to password protection.
  FILTER_E_PASSWORD = HResult($8004170B);

  // The document format is not recognized by the filter.
  FILTER_E_UNKNOWNFORMAT = HResult($8004170C);
const
  // Interface id of IFilter (the same GUID is attached to the interface
  // declaration below). The identifier keeps its original spelling so that
  // existing references still resolve.
  IID_IFIlter: TGUID = '{89BCB740-6119-101A-BCB7-00DD010655AF}';

type
  // Text/property extraction interface implemented by filter DLLs.
  // The method order defines the vtable layout and must not be changed.
  IFilter = interface(IUnknown)
    ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
    // Prepares the filter. grfFlags is a combination of IFILTER_INIT_*;
    // aAttributes points to cAttributes entries (may be nil when the count
    // is 0); pFlags receives IFILTER_FLAGS_* values.
    function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
    // Advances to the next chunk and describes it in pStat.
    function GetChunk(out pStat: STAT_CHUNK): HResult; stdcall;
    // Reads text from the current chunk. On input pcwcBuffer is the buffer
    // capacity in WideChars, on return the number of WideChars written.
    function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
    // Returns a pointer to a PROPVARIANT allocated by the filter (the C
    // signature is PROPVARIANT**), hence the out parameter is a pointer type.
    function GetValue(out ppPropValue: PPropVariant): HResult; stdcall;
    // riid corresponds to REFIID in the C declaration, i.e. it is passed by
    // reference; "const" gives exactly that for a TGUID.
    function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
  end;
implementation
end.
--------------------------
Example:
// Extracts the text of a document through the IFilter that query.dll
// (LoadIFilter) associates with the file type, then shows it.
// Requires the ntquery and filter units; ShowMessage comes from Dialogs,
// which is part of the default uses clause of a form unit.
procedure TForm1.Button1Click(Sender: TObject);
const
  // Replace with your adobe file name
  SourceFile = 'c:\adobe.pdf';
  // Capacity of the text buffer, counted in WideChars (not bytes).
  BufChars = 2048;
var
  punk: IUnknown;
  pfilt: IFilter;
  chunk: STAT_CHUNK;
  wfile: WideString;
  pwc: PWideChar;
  hret: HResult;
  flags: ULONG;
  c: Cardinal;
  extracted: string;
begin
  extracted := '';
  wfile := SourceFile;

  // Let query.dll find and load the persistent handler for this file type.
  if LoadIFilter(PWideChar(wfile), nil, punk) <> S_OK then
    raise Exception.CreateFmt('No IFilter could be loaded for "%s"', [SourceFile]);
  if punk.QueryInterface(IFilter, pfilt) <> S_OK then
    raise Exception.Create('The loaded object does not implement IFilter');
  punk := nil;

  // Further IFILTER_INIT_* flags can be combined here with "or".
  hret := pfilt.Init(IFILTER_INIT_CANON_SPACES, 0, nil, flags);
  if hret <> S_OK then
    raise Exception.CreateFmt('IFilter.Init failed (HRESULT 0x%.8x)', [hret]);

  GetMem(pwc, BufChars * SizeOf(WideChar));
  try
    // Walk every chunk; any result other than S_OK ends the walk.
    while pfilt.GetChunk(chunk) = S_OK do
    begin
      // Property-value chunks carry no text to read with GetText.
      if (chunk.flags and CHUNK_TEXT) = 0 then
        Continue;
      repeat
        c := BufChars;                  // in: capacity, out: chars returned
        hret := pfilt.GetText(c, pwc);
        // FILTER_S_LAST_TEXT is a success code that still delivers text.
        if Succeeded(hret) and (c > 0) then
          extracted := extracted + WideCharLenToString(pwc, c);
      until hret <> S_OK;
    end;
  finally
    FreeMem(pwc);
  end;

  ShowMessage(extracted);
end;
--------------------------
Hope this helps get you started
Russell
interested
Cool!
ASKER
Russell,
Were you able to test this using the Adobe PDF IFilter? I created a sample application and added the two units (ntquery and filter) and the code to the button event on the main form but when I change query.dll to the ifilter's path and file name (d:\pdffilt.dll) I get an access violation when the program loads.
Were you able to test this using the Adobe PDF IFilter? I created a sample application and added the two units (ntquery and filter) and the code to the button event on the main form but when I change query.dll to the ifilter's path and file name (d:\pdffilt.dll) I get an access violation when the program loads.
Dokken
1.) Yes I did test this.
2.) ??? Why are you changing query.dll ???
This is an MS DLL that exposes the LoadIFilter() function. This function is NOT exposed in the pdffilt.dll. The purpose of this is to allow you to pass a filename, and have MS do the work of finding the persistent handler associated with the file, then loading and returning the IFilter to you. The only thing that you would have needed to change in the code I gave you was the .pdf file that you wanted to filter on.
If I misunderstood what you changed, let me know (post the code) and we'll figure it out.
Russell
ASKER
Russell,
I misunderstood how you were doing this. I was looking to call the DLL directly. Will your way work under Win98? Does it require anything, like IIS server?
Also, how can I view the text it extracts?
I misunderstood how you were doing this. I was looking to call the DLL directly. Will your way work under Win98? Does it require anything, like IIS server?
Also, how can I view the text it extracts?
Sorry Dokken,
I gave you one way of doing it; which allows you to pass any file name, and it will find the associated filter for it. But, the query.dll is part of the MS indexing services, and requires NT/Win2K/WinXP. If you just want to deal with the PDF files (on all Win versions), then the following is what you want:
1.) Do not include the ntquery unit in your uses clause (because of its static binding to query.dll)
2.) Define the following const
const
  // Class id of the Adobe PDF filter; used to create the filter instance
  // directly instead of going through query.dll.
  PDFFilter: TGUID = '{4C904448-74A9-11d0-AF6E-00C04FD8DC02}';
3.) Use the IPersistFile to load the PDF file:
Example:
--------------------------
// Extracts the text of a PDF file by instantiating the PDF filter class
// directly (no dependency on query.dll) and shows the result.
// Requires the filter unit, ActiveX (IPersistFile, CoCreateInstance) and the
// PDFFilter class-id constant.
procedure TForm1.Button1Click(Sender: TObject);
const
  // Replace with your adobe file name
  SourceFile = 'c:\adobe.pdf';
  // Capacity of the text buffer, counted in WideChars (not bytes).
  BufChars = 2048;
var
  pfilt: IFilter;
  ppfile: IPersistFile;
  chunk: STAT_CHUNK;
  wfile: WideString;
  pwc: PWideChar;
  hret: HResult;
  flags: ULONG;
  c: Cardinal;
  str: String;

  // Raises when a required COM call did not return S_OK. A real exception is
  // used instead of Assert because assertions can be compiled out.
  procedure Check(hr: HResult; const step: string);
  begin
    if hr <> S_OK then
      raise Exception.CreateFmt('%s failed (HRESULT 0x%.8x)', [step, hr]);
  end;

begin
  str := '';
  wfile := SourceFile;

  // Get the class instance for the pdf filter
  Check(CoCreateInstance(PDFFilter, nil, CLSCTX_INPROC_SERVER, IFilter, pfilt),
    'CoCreateInstance');
  // Get the IPersistFile handler
  Check(pfilt.QueryInterface(IPersistFile, ppfile), 'QueryInterface(IPersistFile)');
  // Load the file
  Check(ppfile.Load(PWideChar(wfile), STGM_READ), 'IPersistFile.Load');
  // Init the filter; further IFILTER_INIT_* flags can be combined with "or"
  Check(pfilt.Init(IFILTER_INIT_CANON_SPACES, 0, nil, flags), 'IFilter.Init');

  // Allocate memory for the wide char text buffer
  GetMem(pwc, BufChars * SizeOf(WideChar));
  // Try...Finally so we can clean up
  try
    // Walk every chunk; any result other than S_OK ends the walk.
    while pfilt.GetChunk(chunk) = S_OK do
    begin
      // Property-value chunks carry no text to read with GetText.
      if (chunk.flags and CHUNK_TEXT) = 0 then
        Continue;
      // Get all the text within the chunk
      repeat
        c := BufChars;                  // in: capacity, out: chars returned
        hret := pfilt.GetText(c, pwc);
        if ((hret = S_OK) or (hret = FILTER_S_LAST_TEXT)) and (c > 0) then
          str := str + WideCharLenToString(pwc, c);
      until hret <> S_OK;
    end;
  finally
    // Release the interfaces
    ppfile := nil;
    pfilt := nil;
    // Free the allocated memory
    FreeMem(pwc);
  end;

  ShowMessage(str);
end;
--------------------------
Take note of the GetText function. On input, the cardinal is set to the size of the buffer in WideChars (note: size in WideChars = allocated buffer size in bytes div 2); on return it holds the number of chars returned. Use WideCharLenToString to get the value as a string. What you do with the text (string) is up to you.
As far as a flow of operation goes :
1.) Get the interface
2.) Load the file
3.) Init the filter
4.) Call GetChunk()
5.) Call GetText until Chunk is exhausted
6.) Call GetChunk again, if no more chunks then done
7.) Clean up
The above example *should* (can't verify as I only have a 2000 box) work on Win95 on up.
Hope this clears it all up, if not, let me know :-)
Russell
ASKER
Russell,
I removed ntquery and added const PDFFilter: TGUID = '{4C904448-74A9-11d0-AF6E-00C04FD8DC02}'; but I'm unable to compile the test app. It gives "Undeclared identifier" errors at ppfile: IPersistFile; and a few other locations.
I removed ntquery and added const PDFFilter: TGUID = '{4C904448-74A9-11d0-AF6E-
Dokken,
What version of Delphi are you compiling under? I just checked my D3 and D5, and both have IPersistFile defined in the ActiveX.pas unit. You should be including ActiveX as well as ComObj in this project. (As a general rule of thumb, most of your COM-based projects will require the inclusion of these two units.)
Russell
ASKER
Russell,
I'm using D5 so I must be doing something wrong. How do I add the ActiveX and ComObj?
I'm using D5 so I must be doing something wrong. How do I add the ActiveX and ComObj?
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
ASKER
It works... thanks. This solves a big problem.
The link to the IFilter page at MSDN no longer exists. Could you please update this link?
Thanks a lot.
Thanks a lot.
This is the new link to the IFilter interface page:
http://msdn.microsoft.com/en-us/library/ms691105%28VS.85%29.aspx
http://msdn.microsoft.com/en-us/library/ms691105%28VS.85%29.aspx