Link to home
Start Free TrialLog in
Avatar of hankknight
hankknightFlag for Canada

asked on

Remove bad words from array

I have two arrays.  One contains words taken from a book and one contains a list of unwanted words.

I would like to remove every word found in the second array from the first.
<pre><?php
// Word-frequency demo: tokenise a sample sentence, count how often each
// word occurs, and dump the counts inside a <pre> block.
// $badWords is only declared here; this script never applies it.

// Stop-word list keyed 1..248. Each row starts with an explicit integer key
// and PHP numbers the rest of the row consecutively.
$badWords = array(
    1   => "add", "after", "again", "all", "almost", "also", "always", "an", "and", "any",
    11  => "are", "as", "ask", "at", "back", "be", "because", "been", "before", "being",
    21  => "between", "both", "but", "by", "ca", "cad", "cd", "call", "came", "can",
    31  => "cause", "cange", "come", "could", "did", "differ", "do", "does", "doing", "don't",
    41  => "dont", "don", "nt", "down", "done", "each", "else", "end", "ensure", "even",
    51  => "every", "eye", "far", "find", "first", "food", "for", "found", "four", "from",
    61  => "from", "get", "give", "go", "good", "great", "grow", "had", "hand", "hard",
    71  => "has", "have", "he", "head", "help", "her", "her", "here", "high", "him",
    81  => "his", "home", "hot", "house", "how", "however", "i", "im", "i'm", "i'll",
    91  => "i'ld", "ill", "ll", "i'd", "if", "in", "include", "includes", "into", "is",
    101 => "it", "its", "it's", "just", "keep", "kind", "know", "large", "last", "late",
    111 => "later", "learn", "left", "let", "light", "like", "line", "little", "live", "long",
    121 => "look", "lot", "low", "made", "make", "man", "many", "may", "me", "mean",
    131 => "men", "might", "more", "most", "move", "much", "must", "my", "name", "near",
    141 => "need", "needed", "new", "next", "no", "not", "now", "of", "off", "old",
    151 => "on", "once", "one", "only", "or", "other", "our", "out", "overown", "page",
    161 => "part", "per", "put", "quite", "re", "read", "really", "right", "round", "run",
    171 => "said", "same", "saw", "say", "see", "self", "set", "she", "should", "show",
    181 => "side", "since", "small", "so", "some", "sound", "stand", "start", "still", "such",
    191 => "sun", "sure", "take", "tell", "test", "than", "that", "the", "their", "them",
    201 => "then", "there", "these", "they", "thing", "things", "this", "three", "through", "to",
    211 => "too", "try", "turn", "two", "under", "up", "us", "usd", "use", "very",
    221 => "want", "was", "way", "we", "well", "went", "were", "what", "when", "where",
    231 => "which", "while", "who", "why", "will", "willing", "with", "without", "would", "write",
    241 => "you", "you'll", "youll", "you'd", "youd", "yould", "your", 'youre'
);

// Sample input.
$text = "This is a test and only a test to demonstrate the ability of this script to identify and remove all the words that are too common to be used.";

// A "word" is a run of apostrophes, digits, hyphens, underscores, ASCII
// letters, and bytes in the ranges \xc0-\xd6, \xd8-\xf6 and \xf8-\xff.
$wordPattern = '/[\'0-9\-\x41-\x5a\x5f\x61-\x7a\xc0-\xd6\xd8-\xf6\xf8-\xff]+/';
preg_match_all($wordPattern, $text, $matches);

// $matches[0] holds the full matches; counting them gives word => occurrences.
// The count is case-sensitive, so "This" and "this" are separate entries.
$frequency = array_count_values($matches[0]);

print_r($frequency);

?></pre>

Open in new window

Avatar of Guy Hengel [angelIII / a3]
Guy Hengel [angelIII / a3]
Flag of Luxembourg image

You should try the array_diff() function:
http://lu.php.net/manual/en/function.array-diff.php
$newarray = array_diff($bookwords, $badwords);

http://us3.php.net/manual/en/function.array-diff.php
Avatar of hankknight

ASKER

This does NOT work:
<pre><?php
// Count the words in a sample sentence, drop every word that appears in the
// stop-word list, and print the remaining word => count pairs.
//
// Why the earlier attempt failed: array_diff($frequency, $badWords) compares
// array VALUES. $frequency is keyed by word and its values are the integer
// counts, so no count ever equals a stop word and nothing was removed.
// The words have to be compared as KEYS instead (see array_diff_key below).

$text = "This is a test and only a test to demonstrate the ability of this script to identify and remove all the words that are too common to be used.";

// Lower-case before tokenising so that "This" and "this" are counted as one
// word and match the all-lowercase stop list.
// NOTE(review): strtolower() works byte-wise; whether the accented bytes the
// pattern admits (\xc0-\xff) get case-folded depends on PHP version/locale —
// confirm if non-ASCII text matters.
preg_match_all('/[\'0-9\-\x41-\x5a\x5f\x61-\x7a\xc0-\xd6\xd8-\xf6\xf8-\xff]+/', strtolower($text), $words);

// word => number of occurrences, in order of first appearance.
$frequency = array_count_values($words[0]);

// Stop words. The numeric keys are irrelevant here; only the values are used.
// NOTE(review): "cange" and "overown" look like typos (possibly "change" and
// "over", "own") — left untouched, confirm with the list's author.
$badWords = array(
    "add", "after", "again", "all", "almost", "also", "always", "an", "and", "any",
    "are", "as", "ask", "at", "back", "be", "because", "been", "before", "being",
    "between", "both", "but", "by", "ca", "cad", "cd", "call", "came", "can",
    "cause", "cange", "come", "could", "did", "differ", "do", "does", "doing", "don't",
    "dont", "don", "nt", "down", "done", "each", "else", "end", "ensure", "even",
    "every", "eye", "far", "find", "first", "food", "for", "found", "four", "from",
    "from", "get", "give", "go", "good", "great", "grow", "had", "hand", "hard",
    "has", "have", "he", "head", "help", "her", "her", "here", "high", "him",
    "his", "home", "hot", "house", "how", "however", "i", "im", "i'm", "i'll",
    "i'ld", "ill", "ll", "i'd", "if", "in", "include", "includes", "into", "is",
    "it", "its", "it's", "just", "keep", "kind", "know", "large", "last", "late",
    "later", "learn", "left", "let", "light", "like", "line", "little", "live", "long",
    "look", "lot", "low", "made", "make", "man", "many", "may", "me", "mean",
    "men", "might", "more", "most", "move", "much", "must", "my", "name", "near",
    "need", "needed", "new", "next", "no", "not", "now", "of", "off", "old",
    "on", "once", "one", "only", "or", "other", "our", "out", "overown", "page",
    "part", "per", "put", "quite", "re", "read", "really", "right", "round", "run",
    "said", "same", "saw", "say", "see", "self", "set", "she", "should", "show",
    "side", "since", "small", "so", "some", "sound", "stand", "start", "still", "such",
    "sun", "sure", "take", "tell", "test", "than", "that", "the", "their", "them",
    "then", "there", "these", "they", "thing", "things", "this", "three", "through", "to",
    "too", "try", "turn", "two", "under", "up", "us", "usd", "use", "very",
    "want", "was", "way", "we", "well", "went", "were", "what", "when", "where",
    "which", "while", "who", "why", "will", "willing", "with", "without", "would", "write",
    "you", "you'll", "youll", "you'd", "youd", "yould", "your", 'youre'
);

// array_flip() turns the stop words into keys, so array_diff_key() can drop
// every $frequency entry whose key (the word) is a stop word. Duplicate stop
// words simply collapse into one key when flipped.
$newarray = array_diff_key($frequency, array_flip($badWords));

print_r($newarray);

?></pre>

Open in new window

ASKER CERTIFIED SOLUTION
Avatar of Cornelia Yoder
Cornelia Yoder
Flag of United States of America image

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
SOLUTION
Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial