<?php
/*
*
* implements a Paice/Husk Stemmer written in PHP by Alexis Ulrich (http://alx2002.free.fr)
*
* This code is in the public domain.
*
*/
// the rule patterns include all accented forms for a given language
$rule_pattern_en
=
"/^([a-z]*)(\*){0,1}(\d)([a-z]*)([.|>])/"
;
$rule_pattern_fr
=
"/^([a-zàâèéêëîïôûùç]*)(\*){0,1}(\d)([a-zàâèéêëîïôûùç]*)([.|>])/"
;
// returns the number of the first rule from the rule number $rule_number
// that can be applied to the given reversed form
// returns -1 if no rule can be applied, ie the stem has been found
function
getFirstRule
(
$reversed_form
,
$rule_number
,
$language
=
'en'
) {
require(
'PaiceHuskStemRules_'
.
$language
.
'.php'
);
eval(
"global
\$rule_pattern_$language;"
);
eval(
"\$PaiceHuskStemmerRules = \$PaiceHuskStemmerRules_$language;" );
$nb_rules
=
sizeOf
(
$PaiceHuskStemmerRules
);
for (
$i
=
$rule_number
;
$i
<
$nb_rules
;
$i
++) {
// gets the letters from the current rule
$rule
=
$PaiceHuskStemmerRules
[
$i
];
eval(
"
\$rule = preg_replace(
\$rule_pattern_$language,
\"\\\\
1
\"
,
\$rule);"
);
if (
strncasecmp
(
utf8_decode
(
$rule
),
$reversed_form
,
strlen
(
utf8_decode
(
$rule
))) ==
0
) return
$i
;
}
return -
1
;
}
/*
* Check the acceptability of a stem for a given language
*
* $reversed_stem: the stem to check in reverse form
* $language: text language (default: French)
*/
function
checkAcceptability
(
$reversed_stem
,
$language
=
'en'
) {
switch (
$language
) {
case
'en'
:
# English
if (
preg_match
(
"/
[
aeiouy
]$
/"
,
utf8_encode
(
$reversed_stem
))) {
// if the form starts with a vowel then at least two letters must remain after stemming (e.g., "owed"/"owing" --> "ow", but not "ear" --> "e")
return (
strlen
(
$reversed_stem
) >=
2
);
}
else {
// if the form starts with a consonant then at least three letters must remain after stemming
if (
strlen
(
$reversed_stem
) <
3
) return
False
;
// and at least one of these must be a vowel or "y" (e.g., "saying" --> "say" and "crying" --> "cry", but not "string" --> "str", "meant" --> "me" or "cement" --> "ce")
return (
preg_match
(
"/[aeiouy]/"
,
utf8_encode
(
$reversed_stem
)));
}
break;
case
'fr'
:
# French
if (
preg_match
(
"/
[
a??e????i??o?u??y
]$
/"
,
utf8_encode
(
$reversed_stem
))) {
// if the form starts with a vowel then at least two letters must remain after stemming (e.g.: "?taient" --> "?t")
return (
strlen
(
$reversed_stem
) >
2
);
}
else {
// if the form starts with a consonant then at least two letters must remain after stemming
if (
strlen
(
$reversed_stem
) <=
2
) {
return
False
;
}
// and at least one of these must be a vowel or "y"
return (
preg_match
(
"/[a??e????i??o?u??y]/"
,
utf8_encode
(
$reversed_stem
)));
}
break;
break;
default:
die(
"Error in checkAcceptability function: the language <i>$language</i> is not supported."
);
}
}
/*
* the actual Paice/Husk stemmer
* which returns a stem for the given form
*
* $form: the word for which we want the stem
* $language: the word language (default: French)
*/
function
PaiceHuskStemmer
(
$form
,
$language
=
'en'
) {
require(
'PaiceHuskStemRules_'
.
$language
.
'.php'
);
eval(
"global
\$rule_pattern_$language;"
);
eval(
"\$PaiceHuskStemmerRules = \$PaiceHuskStemmerRules_$language;" );
$intact
=
True
;
$stem_found
=
False
;
$reversed_form
=
strrev
(
utf8_decode
(
$form
));
$rule_number
=
0
;
// that loop goes through the rules' array until it finds an ending one (ending by '.') or the last one ('end0.')
while (
True
) {
$rule_number
=
getFirstRule
(
$reversed_form
,
$rule_number
,
$language
);
if (
$rule_number
== -
1
) {
// no other rule can be applied => the stem has been found
break;
}
$rule
=
$PaiceHuskStemmerRules
[
$rule_number
];
eval(
"preg_match(
\$rule_pattern_$language,
\$rule,
\$matches);"
);
if ((
$matches
[
2
] !=
'*'
) || (
$intact
)) {
$reversed_stem
=
utf8_decode
(
$matches
[
4
]) .
substr
(
$reversed_form
,
$matches
[
3
],
strlen
(
$reversed_form
)-
$matches
[
3
]);
if (
checkAcceptability
(
$reversed_stem
,
$language
)) {
$reversed_form
=
$reversed_stem
;
if (
$matches
[
5
] ==
'.'
) break;
}
else {
// go to another rule
$rule_number
++;
}
}
else {
// go to another rule
$rule_number
++;
}
}
return
utf8_encode
(
strrev
(
$reversed_form
));
}
?>