<?php
/*
*
* implements a Paice/Husk Stemmer written in PHP by Alexis Ulrich (http://alx2002.free.fr)
*
* Tool kit
*
* This code is in the public domain.
*
*/
// Lift PHP's execution time limit (0 = no limit): the corpus-processing
// functions in this tool kit can run for longer than the default allowance.
set_time_limit
(
0
);
// Load the stemmer itself; indexText() and developPaiceHusk() below call
// PaiceHuskStemmer(), which is presumably defined in this file.
require_once(
'PaiceHuskStemmer.php'
);
// Tokens treated as punctuation: indexText() and getStatistics() skip any
// word of the standardized text that is equal to one of these.
$punctuation = array(
    '.', ',', ';', ':', '!', '?',   // sentence marks
    '"', '\'',                      // double and single quotes
    '(', ')',                       // parentheses
    '--'                            // dash
);
/*
* standardized punctuation: rewrites the text so that each of the marks
* " ' . , ; : ( ) ? ! is surrounded by exactly one space on each side, and
* every whitespace character (tab, newline...) becomes a plain space.
* For English, the contraction "n't" and the word "o'clock" are glued back.
*
* $text: string, the text to be processed
* $lang: language of the text (default: English)
*
* returns the standardized text
*/
function standardizePunctuation($text, $lang = 'en') {
    // Both rules run in order on the whole text:
    //  1. any run of spaces around a punctuation mark collapses to ' <mark> '
    //  2. each remaining whitespace character is turned into a space
    $spaced = preg_replace(
        array('/( )*(["\'\.,;:\(\)\?!])( )*/', '/\s/'),
        array(' \\2 ', ' '),
        $text
    );

    // Other languages get no further treatment.
    if ($lang != 'en') {
        return $spaced;
    }

    // English only: undo the split of didn't, couldn't... and of o'clock,
    // which the first rule tore apart at the apostrophe.
    return str_replace(
        array('n \' t', 'o \' clock'),
        array('n\'t', 'o\'clock'),
        $spaced
    );
}
/*
* uses typographic rules to return a well-written string from a
* standardized-punctuated one (see standardizePunctuation)
*
* Only English has rules; for any other language the text goes through
* preg_replace with empty rule lists.
*
* $text: string, the text to be processed
* $lang: language of the text (default: English)
*
* returns the text with the spacing around punctuation marks adjusted
*/
function localizePunctuation($text, $lang = 'en') {
    // pattern => replacement, applied one after the other in this order
    $rules = array();

    if ($lang == 'en') {
        $rules = array(
            // double quotes
            '/ " /'                      => ' "',         // keeps the space before an opening double-quote
            '/ "$/'                      => '"',          // no space before a double-quote ending the string
            '/( )(")([\.,;:\)!\?])/'     => '"\\3',       // no space before a double-quote followed by .,;:)!?
            // single quotes
            '/ \' /'                     => ' \'',        // keeps the space before an opening single-quote
            '/ \'s /'                    => '\'s ',       // no space before an 's (like in "it's")
            '/ \'$/'                     => '\'',         // no space before a single-quote ending the string
            '/( )(\')([\.,;:\)!\?])/'    => '\'\\3',      // no space before a single-quote followed by .,;:)!?
            '/s \'/'                     => 's\' ',       // five minutes' walk, Thomas' car
            '/s<\/([a-z]+)> \'/i'        => 's</\\1>\' ', // the same across a </xyz> closing tag
            // remaining marks
            '/ ([\.,;:\)!\?])/'          => '\\1',        // .,;:)!? take no space before them
            '/\( /'                      => '(',          // no space after (
            '/ \. \. \./'                => '...'         // ellipsis
        );
    }

    return preg_replace(array_keys($rules), array_values($rules), $text);
}
/*
* indexes the given text and returns an array of three entries:
* - 'original': the original text
* - 'modified': the modified text, ie the standardized-punctuation form
* - 'index': an array of three-element arrays, one per indexed word:
*     - 'form': the form of the word as found in the modified text
*     - 'stem': the stem of the lowercased form, as returned by PaiceHuskStemmer()
*     - 'index': the byte offset of the form in the modified text
*
* A word is indexed when it is longer than one character, is not a
* punctuation token (global $punctuation) and is not in the stop list
* loaded from stoplist_<lang>.inc.php.
*
* $text: string, the text to be processed
* $lang: language of the text (default: English)
*/
function indexText($text, $lang = 'en') {
    global $punctuation;
    // Stop lists already loaded, keyed by language.
    static $stoplists = array();

    if (!isset($stoplists[$lang])) {
        $stoplist_file = 'stoplist_' . $lang . '.inc.php';
        require_once($stoplist_file);
        if (!isset($stoplist)) {
            // require_once does nothing when the file was already loaded
            // (an earlier call, or another script): the $stoplist it defined
            // then lives in that other scope and is not visible here. Load it
            // again into this scope.
            // NOTE(review): assumes the file only assigns $stoplist - confirm.
            require($stoplist_file);
        }
        // Best effort: without a usable list, no word is filtered out.
        $stoplists[$lang] = (isset($stoplist) && is_array($stoplist)) ? $stoplist : array();
    }
    $stopWords = $stoplists[$lang];

    // $punctuation is not set when this file was not loaded at global scope.
    $marks = is_array($punctuation) ? $punctuation : array();

    $thisText = standardizePunctuation($text, $lang);
    $thisTextIndex = array();
    $wordIndex = 0;

    foreach (explode(' ', $thisText) as $form) {
        $word = strtolower($form);
        // words which length is 1 or 0 are not processed.
        // $marks holds strings only, so a strict search is safe; the stop
        // list content is not visible from here, so its search stays loose.
        if ((strlen($word) > 1)
            && !in_array($word, $marks, true)
            && !in_array($word, $stopWords)) {
            $thisTextIndex[] = array(
                'form'  => $form,
                'stem'  => PaiceHuskStemmer($word, $lang),
                'index' => $wordIndex
            );
        }
        // moves past this form and the space that follows it
        $wordIndex += strlen($form) + 1;
    }

    return array(
        'original' => $text,
        'modified' => $thisText,
        'index'    => $thisTextIndex
    );
}
/*
* displays (echoes as HTML) some statistics for the Paice/Husk stemmer
* (default language: English)
*
* Each text is indexed with indexText(); the figures are accumulated over
* all the texts and printed as two summary lines followed by a table.
* Ratios whose denominator is zero (eg an empty $texts) are displayed as 0.
*
* $texts: array of texts
* $stem_lang: language of the texts the given stemmer can handle (default: English)
* $if_lang: language of the interface, 'fr' or anything else for English (default: English)
* $precision: number of digits after the decimal point (default: 2)
*/
function getStatistics($texts, $stem_lang = 'en', $if_lang = 'en', $precision = 2) {
    global $punctuation;
    // $punctuation is not set when this file was not loaded at global scope.
    $marks = is_array($punctuation) ? $punctuation : array();
    $vocab = getStatisticsVocabulary($if_lang);

    $total_sample_words = 0;
    $total_number_stems = 0;
    $total_length = 0;
    $max_length = 0;
    $min_length = null;        // unknown until a first stem is seen
    $unique_words = array();   // used as a set: word => true
    $unique_stems = array();   // used as a set: stem => true
    $nb_changed_words = 0;
    $nb_removed_characters = 0;
    $nb_removed_characters_distribution = array();

    foreach ($texts as $text) {
        $total_sample_words += str_word_count($text);
        $textsArray = indexText($text, $stem_lang);

        // unique words of the standardized text, punctuation excluded
        foreach (explode(' ', $textsArray['modified']) as $word) {
            if (($word !== '') && !in_array($word, $marks, true)) {
                $unique_words[$word] = true;
            }
        }

        $total_number_stems += sizeOf($textsArray['index']);
        foreach ($textsArray['index'] as $entry) {
            $stem = (string) $entry['stem'];
            $form = strtolower($entry['form']);
            $stem_length = strlen($stem);
            $removed = strlen($form) - $stem_length;

            $unique_stems[$stem] = true;
            if ($stem !== $form) {
                $nb_changed_words++;
            }
            $nb_removed_characters += $removed;
            $nb_removed_characters_distribution[] = $removed;
            $total_length += $stem_length;
            if (($min_length === null) || ($stem_length < $min_length)) {
                $min_length = $stem_length;
            }
            if ($stem_length > $max_length) {
                $max_length = $stem_length;
            }
        }
    }

    $total_sample_unique_words = sizeOf($unique_words);
    $total_number_unique_stems = sizeOf($unique_stems);
    $median_removal_rate = getStatisticsMedian($nb_removed_characters_distribution);
    if ($min_length === null) {
        $min_length = 0;
    }

    // summary lines
    echo '<b>' . $vocab['statistics_of_Paice_Husk_Stemmer'] . "</b><br><br>\n";
    echo $vocab['sample_size'] . ' ' . $total_sample_words . ' ' . $vocab['words']
        . ' (' . $vocab['dispatched_in'] . ' ' . sizeOf($texts) . ' ' . $vocab['text_units'] . ")<br>\n";
    echo $vocab['number_of_unique_words'] . " $total_sample_unique_words<br>\n";
    echo "<br>\n";

    // table: one <tbody> per group of related figures
    echo '<table border="1" rules="groups">' . "\n";
    echo '<thead><tr><th></th><th> ' . $vocab['Paice_Husk_Stemmer'] . " </th></tr></thead>\n";
    echo "<tfoot></tfoot>\n";

    echo "<tbody>\n";
    echo getStatisticsRow($vocab['number_of_stems_found'], $total_number_stems);
    echo getStatisticsRow($vocab['number_of_unique_stems_found'], $total_number_unique_stems);
    echo "</tbody>\n";

    echo "<tbody>\n";
    echo getStatisticsRow($vocab['min_max_value_of_found_stems_length'], "$min_length / $max_length");
    echo getStatisticsRow($vocab['mean_value_of_found_stems_length'],
        getStatisticsRatio($total_length, $total_number_stems, $precision));
    echo "</tbody>\n";

    echo "<tbody>\n";
    echo getStatisticsRow($vocab['number_of_words_per_conflation_class'],
        getStatisticsRatio($total_sample_unique_words, $total_number_unique_stems, $precision));
    echo getStatisticsRow($vocab['index_compression'],
        getStatisticsRatio($total_sample_unique_words - $total_number_unique_stems, $total_sample_unique_words, $precision));
    echo getStatisticsRow($vocab['word_change_factor'],
        getStatisticsRatio($nb_changed_words, $total_sample_words, $precision));
    echo "</tbody>\n";

    echo "<tbody>\n";
    echo getStatisticsRow($vocab['number_of_characters_removed'], $nb_removed_characters);
    echo getStatisticsRow($vocab['mean_removal_rate'],
        getStatisticsRatio($nb_removed_characters, $total_number_stems, $precision));
    echo getStatisticsRow($vocab['median_removal_rate'], $median_removal_rate);
    echo "</tbody>\n";

    echo "</table>\n";
}

/*
* helper of getStatistics: returns the interface labels, keyed by label id
*
* $if_lang: language of the interface; 'fr' gives French, anything else English
*/
function getStatisticsVocabulary($if_lang) {
    if ($if_lang == 'fr') {
        return array(
            'statistics_of_Paice_Husk_Stemmer'     => 'Statistiques du stemmer Paice/Husk.',
            'Paice_Husk_Stemmer'                   => 'Stemmer Paice/Husk',
            'sample_size'                          => 'Taille de l\'échantillon :',
            'words'                                => 'mots',
            'dispatched_in'                        => 'répartis en',
            'text_units'                           => 'unités textuelles',
            'number_of_unique_words'               => 'Nombre de mots uniques :',
            'number_of_stems_found'                => 'Nombre de racines :',
            'number_of_unique_stems_found'         => 'Nombre de racines uniques :',
            'min_max_value_of_found_stems_length'  => 'Longueur min/max des racines :',
            'mean_value_of_found_stems_length'     => 'Longueur moyenne des racines :',
            'number_of_words_per_conflation_class' => 'Nombre de mots par classe de conflation :',
            'index_compression'                    => 'Indice de compression :',
            'word_change_factor'                   => 'Facteur de changement par mot :',
            'number_of_characters_removed'         => 'Nombre de caractères supprimés :',
            'mean_removal_rate'                    => 'Taux moyen de suppression :',
            'median_removal_rate'                  => 'Taux médian de suppression :'
        );
    }
    return array(
        'statistics_of_Paice_Husk_Stemmer'     => 'Statistics of the Paice/Husk stemmer.',
        'Paice_Husk_Stemmer'                   => 'Paice/Husk stemmer',
        'sample_size'                          => 'Size of the sample:',
        'words'                                => 'words',
        'dispatched_in'                        => 'dispatched in',
        'text_units'                           => 'text units',
        'number_of_unique_words'               => 'Number of unique words:',
        'number_of_stems_found'                => 'Number of stems found:',
        'number_of_unique_stems_found'         => 'Number of unique stems found:',
        'min_max_value_of_found_stems_length'  => 'Min/Max value of found stems length:',
        'mean_value_of_found_stems_length'     => 'Mean value of found stems length:',
        'number_of_words_per_conflation_class' => 'Number of words per conflation class:',
        'index_compression'                    => 'Index Compression:',
        'word_change_factor'                   => 'Word Change Factor:',
        'number_of_characters_removed'         => 'Number of characters removed:',
        'mean_removal_rate'                    => 'Mean removal rate:',
        'median_removal_rate'                  => 'Median removal rate:'
    );
}

/*
* helper of getStatistics: rounded quotient, 0 when the denominator is zero
*/
function getStatisticsRatio($numerator, $denominator, $precision) {
    if ($denominator == 0) {
        return 0;
    }
    return round($numerator / $denominator, $precision);
}

/*
* helper of getStatistics: median of a list of numbers, 0 for an empty list
*/
function getStatisticsMedian($values) {
    $count = sizeOf($values);
    if ($count == 0) {
        return 0;
    }
    sort($values);
    // sort() reindexes from 0, so the middle element of an odd-sized list is
    // at floor(count / 2); an even-sized list averages that one and the one before.
    $middle = (int) floor($count / 2);
    if ($count % 2 == 0) {
        return 0.5 * ($values[$middle - 1] + $values[$middle]);
    }
    return $values[$middle];
}

/*
* helper of getStatistics: one complete table row, label on the left and value on the right
*/
function getStatisticsRow($label, $value) {
    return '<tr><td align="right">' . $label . ' </td><td align="center">' . $value . "</td></tr>\n";
}
/*
* lists the most common word endings of a corpus in the given $language
*
* Reads corpus_<language>.txt line by line (empty lines and lines starting
* with "p" followed by a digit are skipped), counts every word longer than
* 3 characters and containing no digit, and writes the reversed words,
* grouped by decreasing frequency, in corpus_stems_<language>.txt.
* The frequency is a percentage of the corpus size, ie of the number of
* non-empty, digit-free words read. Words with a frequency below 0.0001 %
* are left out. A confirmation message is echoed at the end; any file
* failure stops the script with die().
*
* $language: text language (default: English)
* $checkWords2Stems: boolean (default value: False)
* if True, lists only the words which are not listed in the matching words2stems_<language>.inc.php file
*/
function listSuffixes($language = 'en', $checkWords2Stems = False) {
    $corpus_file = 'corpus_' . $language . '.txt';
    $stems_file = 'corpus_stems_' . $language . '.txt';

    // opens the corpus
    $corpus_handle = @fopen($corpus_file, 'r');
    if (!$corpus_handle) {
        die('Failed to open the file ' . $corpus_file);
    }

    // loads the words already handled by 'words2stems_<language>.inc.php'
    $known_words = array();
    if ($checkWords2Stems) {
        $words2stems_file = 'words2stems_' . $language . '.inc.php';
        require_once($words2stems_file);
        if (!isset($words2stems)) {
            // require_once does nothing when the file was already loaded
            // elsewhere; the array then lives in that other scope. Load it
            // again into this scope.
            // NOTE(review): assumes the file only assigns $words2stems - confirm.
            require($words2stems_file);
        }
        if (!isset($words2stems) || !is_array($words2stems)) {
            die('Failed to load the file ' . $words2stems_file);
        }
        // The keys are converted with utf8_decode (UTF-8 to ISO-8859-1)
        // before being compared with the corpus words.
        // NOTE(review): utf8_decode is deprecated as of PHP 8.2.
        foreach ($words2stems as $word => $stem) {
            $known_words[utf8_decode($word)] = true;
        }
    }

    // counts the reversed words: reversed word => number of occurrences
    $total_words = 0;
    $words = array();
    // fgets returns false at the end of the file, so the last line is
    // processed even when it has no trailing newline
    while (($raw_line = fgets($corpus_handle)) !== false) {
        $line = trim($raw_line);
        if (($line === '') || preg_match('/^p[0-9]/', $line)) {
            continue;
        }
        // removes the punctuation
        $line = preg_replace('/( )*(["\'\.,;:\(\)\?!])( )*/', ' ', $line);
        // extracts the words
        foreach (explode(' ', $line) as $token) {
            $token = strtolower(trim($token));
            // empty tokens (consecutive spaces) and tokens containing digits are not words
            if (($token === '') || preg_match('/[0-9]+/', $token)) {
                continue;
            }
            $total_words++;
            // only the words of at least 4 characters are kept
            if (strlen($token) > 3) {
                $reversed = strrev($token);
                if (!array_key_exists($reversed, $words)) $words[$reversed] = 1;
                else $words[$reversed]++;
            }
        }
    }
    fclose($corpus_handle);

    // computes their frequency (in percentage of their number of occurrences)
    // such as x% of the words in the corpus have the form...
    // and drops those having a frequency less than 0.0001 %
    // ($total_words cannot be 0 here: every counted word was also added to it)
    $frequencies = array();
    foreach ($words as $word => $occ) {
        $freq = round($occ / $total_words, 7) * 100;
        if ($freq >= 0.0001) {
            $frequencies[$word] = $freq;
        }
    }
    unset($words);

    // sorts them by decreasing frequency
    arsort($frequencies);

    // groups them by frequency, leaving out the words already handled
    $words_per_frequency = array();
    foreach ($frequencies as $word => $freq) {
        if ($checkWords2Stems && array_key_exists(strrev($word), $known_words)) {
            continue;
        }
        $words_per_frequency["$freq"][] = $word;
    }
    unset($frequencies);

    // writes them out in a text file
    $stems_handle = @fopen($stems_file, 'w');
    if (!$stems_handle) {
        die('Failed to open the file ' . $stems_file);
    }
    foreach ($words_per_frequency as $frequency => $group) {
        // sorts them by inverse suffixes
        sort($group);
        if (fwrite($stems_handle, "\nfrequency: " . $frequency . "\n---------\n") === false) {
            die('Failed to write in the file ' . $stems_file);
        }
        foreach ($group as $word) {
            // reversed word first, then the word as read in the corpus
            if (fwrite($stems_handle, $word . "\t(" . strrev($word) . ")\n") === false) {
                die('Failed to write in the file ' . $stems_file);
            }
        }
    }
    fclose($stems_handle);

    // displays ok message
    echo "Corpus size: $total_words forms.<br>";
    echo 'The stems of the file <b>' . $corpus_file . '</b> have been saved in <b>' . $stems_file . '</b>.';
}
/*
* displays for each word of the words2stems array the word and the returned stem
* after having processed it with the new set of rules if the returned stem is
* different from the expected one.
*
* The expected stems are read from words2stems_<language>.inc.php (word =>
* stem). When every word gives the expected stem, a single "exact match"
* message is displayed instead. The script stops with die() when the
* expected stems cannot be loaded.
*
* $language: text language (default: French)
* $if_lang: language of the interface, 'fr' or anything else for English (default: English)
*/
function developPaiceHusk($language = 'fr', $if_lang = 'en') {
    $words2stems_file = 'words2stems_' . $language . '.inc.php';
    require_once($words2stems_file);
    if (!isset($words2stems)) {
        // require_once does nothing when the file was already loaded
        // elsewhere; the array then lives in that other scope. Load it again
        // into this scope, otherwise the loop below would have nothing to
        // check and "exact match" would be reported wrongly.
        // NOTE(review): assumes the file only assigns $words2stems - confirm.
        require($words2stems_file);
    }
    if (!isset($words2stems) || !is_array($words2stems)) {
        die('Failed to load the file ' . $words2stems_file);
    }

    if ($if_lang == 'fr') {
        $vocab = array(
            'troubles'    => 'Des problèmes sont encore présents avec les racines suivantes...',
            'instead_of'  => 'au lieu de',
            'exact_match' => 'Les racines trouvées correspondent à celles listées dans le fichier',
            'file'        => ''
        );
    }
    else {
        $vocab = array(
            'troubles'    => 'There\'s still some troubles with the following stems...',
            'instead_of'  => 'instead of',
            'exact_match' => 'The stems found match exactly the ones listed in the',
            'file'        => ' file'
        );
    }

    // number of words whose stem differs from the expected one
    $cpt = 0;
    foreach ($words2stems as $word => $stem) {
        $stemmer_stem = PaiceHuskStemmer($word, $language);
        // compared as strings: a loose comparison would treat different
        // numeric-looking strings (eg '1e3' and '1000') as equal
        if ((string) $stem !== (string) $stemmer_stem) {
            if ($cpt == 0) {
                // opens the list before the first mismatch
                echo '<br><b>' . $vocab['troubles'] . '</b><br><blockquote>';
            }
            $cpt++;
            echo "$cpt. $word => <i>$stemmer_stem</i> " . $vocab['instead_of'] . " <i>$stem</i><br>";
        }
    }

    if ($cpt != 0) {
        echo '</blockquote>';
    }
    else {
        echo '<br><b>' . $vocab['exact_match'] . ' <i>words2stems_' . $language . '</i>' . $vocab['file'] . '.</b>';
    }
}
?>