Keyword Density

Hi

I'm looking for some code for calculating the keyword density of a text file.

Thanks
zattzAsked:
Who is Participating?

[Webinar] Streamline your web hosting managementRegister Today

x
 
Pierre CorneliusConnect With a Mentor Commented:
Here's a starting point for you:

PAS File:
=================================================================
unit Unit1;

interface

uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, XPMan, StdCtrls, Grids;

type
  TKeyWordRec = record
    KeyWord: string;
    Count: integer;
  end;
  TKeyWordRecs = array of TKeyWordRec;

  TWordsArray = array of string;

  TForm1 = class(TForm)
    Label1: TLabel;
    inpFilename: TEdit;
    btnSelectFile: TButton;
    dlgOpen: TOpenDialog;
    Button1: TButton;
    StringGrid1: TStringGrid;
    procedure Button1Click(Sender: TObject);
    procedure btnSelectFileClick(Sender: TObject);
    procedure FormDestroy(Sender: TObject);
    procedure FormCreate(Sender: TObject);
  private
    KeyWords: TKeyWordRecs;
    procedure GetWordsAndCount;
    procedure UpdateGrid;
    function IndexOfKeyWord(kw: string):integer;
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.dfm}

procedure SortKeyWords(var A: TKeyWordRecs; Ascending: boolean = true);

  procedure qs(var A: TKeyWordRecs; iLo, iHi: Integer);
  var
    Lo, Hi: Integer;
    MID, T: TKeyWordRec;
  begin
    Lo := iLo;
    Hi := iHi;
    Mid := A[(Lo + Hi) div 2];
    repeat
      if Ascending then
      begin
        while A[Lo].Count < Mid.Count do Inc(Lo);
        while A[Hi].Count > Mid.Count do Dec(Hi);
      end
      else begin
        while A[Lo].Count > Mid.Count do Inc(Lo);
        while A[Hi].Count < Mid.Count do Dec(Hi);
      end;
      if Lo <= Hi then
      begin
        T := A[Lo];
        A[Lo] := A[Hi];
        A[Hi] := T;
        Inc(Lo);
        Dec(Hi);
      end;
    until Lo > Hi;

    if Hi > iLo then qs(A, iLo, Hi);
    if Lo < iHi then qs(A, Lo, iHi);
  end;

begin
  qs(A, Low(A), High(A));
end;

function TForm1.IndexOfKeyWord(kw: string):integer;
var i: integer;
begin
  result:= -1;
  for i:= low(KeyWords) to High(KeyWords) do
  begin
    if KeyWords[i].KeyWord = kw then
    begin
      result:= i;
      exit;
    end;
  end;
end;

procedure TForm1.Button1Click(Sender: TObject);
begin
  SetLength(KeyWords, 0);
  GetWordsAndCount;
end;

procedure TForm1.GetWordsAndCount;
const AcceptedChars: set of char = ['a'..'z', 'A'..'Z','-',''''];
    function SeparateWords(AString: string): TWordsArray;
    var p1,p2: pchar;
    begin
      if AString = '' then exit;
      p1:= pchar(AString);
      p2:= p1;
      repeat
        Inc(p2);
        While NOT (p1^ in AcceptedChars) and (p1^ <> #0)
          do Inc(p1);
        if NOT (p2^ in AcceptedChars) AND (p2 > p1) then
        begin
          SetLength(result, length(result)+1);
          result[length(result)-1]:= p1;
          SetLength(result[length(result)-1], p2-p1);
          While NOT (p2^ in AcceptedChars) and (p2^ <> #0)
            do Inc(p2);
          p1:= p2;
        end;
      until p2^ = #0;
    end;

var sl: TStringList;
    s: string;
    Words: TWordsArray;
    i, w, kwi: integer;
begin
  if NOT FileExists(inpFilename.Text)
    then raise exception.Create(Format('Error: File not found(%s)', [inpFilename.Text]));

  sl:= TStringList.Create;
  try
    sl.LoadFromFile(inpFilename.Text);
    for i:= 0 to sl.Count-1 do
    begin
      Words:= SeparateWords(sl[i]);
      for w:= low(Words) to High(Words) do
      begin
        kwi:= IndexOfKeyWord(Words[w]);
        if kwi = -1 then
        begin
          SetLength(KeyWords, Length(KeyWords)+1);
          KeyWords[Length(KeyWords)-1].KeyWord:= Words[w];
          KeyWords[Length(KeyWords)-1].Count:= 1;
        end
        else KeyWords[kwi].Count:= KeyWords[kwi].Count +1;
      end;
      SetLength(Words, 0);
    end;

    SortKeyWords(KeyWords, false);
    UpdateGrid;
  finally
    sl.Free;
  end;
end;

procedure TForm1.btnSelectFileClick(Sender: TObject);
begin
  if dlgOpen.Execute
    then inpFilename.Text:= dlgOpen.FileName;
end;

procedure TForm1.FormDestroy(Sender: TObject);
begin
  SetLength(KeyWords, 0);
end;

procedure TForm1.UpdateGrid;
var r,c, TotalWords: integer;
begin
  StringGrid1.Cells[0,0]:= 'Word';
  StringGrid1.Cells[1,0]:= 'Count';
  StringGrid1.Cells[2,0]:= 'Density';
  StringGrid1.RowCount:= Length(KeyWords)+1;

  TotalWords:= 0;
  for r:= low(KeyWords) to High(KeyWords)
    do TotalWords:= TotalWords + Keywords[r].Count;

  for r:= low(KeyWords) to High(KeyWords) do
  begin
    StringGrid1.Cells[0,r+1]:= KeyWords[r].KeyWord;
    StringGrid1.Cells[1,r+1]:= IntToStr(KeyWords[r].Count);
    StringGrid1.Cells[2,r+1]:= FormatFloat('0.00%', KeyWords[r].Count / TotalWords *100);
  end;
end;

procedure TForm1.FormCreate(Sender: TObject);
begin
  StringGrid1.ColWidths[0]:= 200;
  StringGrid1.ColWidths[1]:= 80;
  StringGrid1.ColWidths[2]:= 80;
end;

end.


DFM File:
=================================================================
object Form1: TForm1
  Left = 192
  Top = 114
  Width = 696
  Height = 480
  Caption = 'Get keyword summary'
  Color = clBtnFace
  Font.Charset = DEFAULT_CHARSET
  Font.Color = clWindowText
  Font.Height = -11
  Font.Name = 'MS Sans Serif'
  Font.Style = []
  OldCreateOrder = False
  OnCreate = FormCreate
  OnDestroy = FormDestroy
  DesignSize = (
    688
    446)
  PixelsPerInch = 96
  TextHeight = 13
  object Label1: TLabel
    Left = 8
    Top = 8
    Width = 42
    Height = 13
    Caption = 'Filename'
  end
  object inpFilename: TEdit
    Left = 64
    Top = 8
    Width = 585
    Height = 21
    Anchors = [akLeft, akTop, akRight]
    TabOrder = 0
  end
  object btnSelectFile: TButton
    Left = 656
    Top = 8
    Width = 20
    Height = 20
    Anchors = [akTop, akRight]
    Caption = '…'
    Font.Charset = ANSI_CHARSET
    Font.Color = clWindowText
    Font.Height = -11
    Font.Name = 'Times New Roman'
    Font.Style = [fsBold]
    ParentFont = False
    TabOrder = 1
    OnClick = btnSelectFileClick
  end
  object Button1: TButton
    Left = 8
    Top = 40
    Width = 121
    Height = 25
    Caption = 'Get words and count'
    TabOrder = 2
    OnClick = Button1Click
  end
  object StringGrid1: TStringGrid
    Left = 8
    Top = 72
    Width = 481
    Height = 353
    ColCount = 3
    DefaultColWidth = 200
    FixedCols = 0
    RowCount = 2
    TabOrder = 3
  end
  object dlgOpen: TOpenDialog
    Filter = 'Text files (*.txt)|*.txt'
    Left = 208
    Top = 8
  end
end


Kind regards
Pierre
0
 
TheRealLokiSenior DeveloperCommented:
can you explain this a little more? I'm unfamiliar with the term
0
 
mikelittlewoodCommented:
Do you mean how many times ALL words appear in a file, or how many times a word you supply appears in a file?
0
Get your problem seen by more experts

Be seen. Boost your question’s priority for more expert views and faster solutions

 
zattzAuthor Commented:
The measure of how many times a word is repeated compared to the overall content. For example, if a word was listed 5 times out of a word count of 100, the keyword density for that word is 5%.

I want the keyword density for all words in a text file, sorted by highest density descending.

Thanks
0
 
TNameCommented:
And here's another one ;)

2 memos and a button on a form, paste some text into Memo1 and press the button
The function Explode() is taken from here:
http://www.swissdelphicenter.ch/torry/showcode.php?id=1326



unit Unit1;

interface

uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls;

type
  TExplodeArray = array of String;

type
  TWordRec=Record
    S:String;
    Count:Integer;
  end;

type
  TForm1 = class(TForm)
    Button1: TButton;
    Memo1: TMemo;
    Memo2: TMemo;
    procedure Button1Click(Sender: TObject);
  private
    WArray: array of TWordRec;
    procedure WordOccurence(Text: string);
    procedure SortWA;
  public
  end;
var
  Form1: TForm1;
  function Explode(const cSeparator, vString: String): TExplodeArray;
implementation

{$R *.dfm}


function Explode(const cSeparator, vString: String): TExplodeArray;
var
  i: Integer;
  S: String;
begin
  S := vString;
  SetLength(Result, 0);
  i := 0;
  while Pos(cSeparator, S) > 0 do begin
    SetLength(Result, Length(Result) +1);
    Result[i] := Copy(S, 1, Pos(cSeparator, S) -1);
    Inc(i);
    S := Copy(S, Pos(cSeparator, S) + Length(cSeparator), Length(S));
  end;
  SetLength(Result, Length(Result) +1);
  Result[i] := Copy(S, 1, Length(S));
end;


procedure TForm1.WordOccurence(Text: string);
var
  i,j:Integer;
  TextWordList: TExplodeArray;
  AlreadyThere:Boolean;
  tempS:String;
begin
  TextWordList := Explode(' ', Text);
  for i := 0 to Length(TextWordList)-1 do begin
    AlreadyThere:=False;
    tempS:=Trim(TextWordList[i]);
    //tempS:=UpperCase(tempS);      //only if no case sensitivity required
    if not (tempS[Length(tempS)] in ['a'..'z','A'..'Z']) then
      Delete(tempS,Length(tempS),1);

    if length(tempS)>1 then
       if not (tempS[1] in ['a'..'z','A'..'Z']) then
          Delete(tempS,1,1);

    if Length(WArray)>0 then begin
      for j := 0 to Length(WArray)-1 do begin
        if WArray[j].S = tempS then begin
          inc(WArray[j].Count);
          AlreadyThere:=True;
        end;
      end;
    end;
    if not AlreadyThere then begin
      SetLength(WArray,Length(WArray)+1);
      WArray[High(WArray)].S:=tempS;
      WArray[High(WArray)].Count:=1;
    end;
  end;
end;

procedure TForm1.Button1Click(Sender: TObject);
var
i:Integer;
begin
 Memo2.Clear;
 WordOccurence(Memo1.Text);
 SortWA;
 for i:=0 to Length(Warray)-1 do begin
   Memo2.Lines.Add(Warray[i].S+' = '+IntToStr(Warray[i].Count));
 end;
 SetLength(WArray,0);
end;


procedure TForm1.SortWA;
var
 ok: boolean;
 i, n: integer;
 temp: TWordRec;
begin
 n := Length(WArray);
 repeat
   ok := true;
   for i := 0 to n - 2 do
     if WArray[i].Count < WArray[i+1].Count then begin
       temp := WArray[i];
       WArray[i] := WArray[i+1];
       WArray[i+1] := temp;
       ok := false;
     end;
 until ok;
end;


end.
0
 
TNameCommented:
BTW, as you probably don't care about case sensitivity, it would be probably better to re-activate this line in the WordOccurence procedure (delete the first "//")
//tempS:=UpperCase(tempS);      //only if no case sensitivity required

and replace this
Memo2.Lines.Add(Warray[i].S+' = '+IntToStr(Warray[i].Count));
with this (in the Button1's OnClick)
Memo2.Lines.Add(LowerCase(Warray[i].S)+' = '+IntToStr(Warray[i].Count));
0
 
zattzAuthor Commented:
Thanks Pierre, thats an awesome example.

TName, yours didnt work
0
 
Pierre CorneliusCommented:
Glad to help
0
 
TNameCommented:
>TName, yours didnt work

Would you mind specifying?
Just curious, as I've re-copied the code as it is into delphi and ran it without problems.
0
 
zattzAuthor Commented:
I can't remember. Let me copy it back into delphi and I'll let you know just now
0
 
zattzAuthor Commented:
Input:

dog
dog
cat cat
cow
moo
horse

output:

dog
dog
cat = 1
cat
cow
moo
horse = 1

----------------------------------------------------------------------------------------------------

What I was looking for was

dog 2, 28%
cat 2, 28%
cow, 1, 14%
moo, 1, 14%
horse, 1, 14%


:)


0
 
TNameCommented:
You're right, I somehow forgot about the %...
0
All Courses

From novice to tech pro — start learning today.