<?php





/*

 *

 * implements a Paice/Husk Stemmer written in PHP by Alexis Ulrich (http://alx2002.free.fr)

 *

 * Tool kit

 *

 * This code is in the public domain.

 *

 */



// sometimes, it's too long...



set_time_limit

(

0

);



require_once(

'PaiceHuskStemmer.php'

);





// punctuation characters



$punctuation 

= array(

'.'

, 

','

, 

';'

, 

':'

, 

'!'

, 

'?'

, 

'"'

, 

'\''

, 

'('

, 

')'

, 

'--'

);









/*

 * standardized punctuation: each punctuation mark has a space before and after it

 *

 *     $text:    string, the text to be processed

 *    $lang:    language of the text (default: English)

 */



function 

standardizePunctuation

(

$text

, 

$lang

=

'en'

) {

    

// puts a space before and after a punctuation mark, 

    // whatever the number of spaces there were before and after it

    

$text 

= 

preg_replace

(

'/( )*(["\'\.,;:\(\)\?!])( )*/'

, 

' \\2 '

, 

$text

);

    

// whitespace

    

$text 

= 

preg_replace

(

'/\s/'

, 

' '

, 

$text

);

    if (

$lang 

== 

'en'

) {

        

// handles the didn't, couldn't...

        

$text 

= 

str_replace

(

'n \' t'

, 

'n\'t'

, 

$text

);

        

// handles the o'clock

        

$text 

= 

str_replace

(

'o \' clock'

, 

'o\'clock'

, 

$text

);

    }

    return 

$text

;

}







/*

 * uses typographic rules to return a well-written string from a standardized-punctuated one

 *

 *     $text:    string, the text to be processed

 *    $lang:    language of the text (default: English)

 */



function 

localizePunctuation

(

$text

, 

$lang

=

'en'

) {

    if (

$lang 

== 

'en'

) {

        

$patterns 

= array(

                    

'/ " /'

,            

# keeps the space before opening double-quote

                    

'/ "$/'

,            

# removes space before ending string double-quote

                    

'/( )(")([\.,;:\)!\?])/'

,    

# removes spaces before non-ending string double-quote

                    

'/ \' /'

,            

# keeps the space before opening simple-quote

                    

'/ \'s /'

,            

# removes the space before an 's (like in "it's")

                    

'/ \'$/'

,            

# removes space before ending string simple-quote

                    

'/( )(\')([\.,;:\)!\?])/'

,    

# removes spaces before non-ending string simple-quote

                    

'/s \'/'

,            

# handles five minutes' walk and Thomas' car

                    

'/s<\/([a-z]+)> \'/i'

,        

# the same with </xyz> closing tag

                    

'/ ([\.,;:\)!\?])/'

,        

# .,;:)!? without space before

                    

'/\( /'

,            

# no space after (

                    

'/ \. \. \./'            

# ...

                    

);

        

$replace 

= array(

                    

' "'

,                

# keeps the space before opening double-quote

                    

'"'

,                

# removes space before ending string double-quote

                    

'"\\3'

,                

# removes spaces before non-ending string double-quote

                    

' \''

,                

# keeps the space before opening simple-quote

                    

'\'s '

,                

# removes the space before an 's (like in "it's")

                    

'\''

,                

# removes space before ending string simple-quote

                    

'\'\\3'

,            

# removes spaces before non-ending string simple-quote

                    

's\' '

,                

# handles five minutes' walk and Thomas' car

                    

's</\\1>\' '

,            

# the same with </xyz> closing tag

                    

'\\1'

,                

# .,;:)!? without space before

                    

'('

,                

# no space after (

                    

'...'                

# ...

                    

);

    }

    else {

        

$patterns 

= array();

        

$replace 

= array();

    }

    

$text 

= 

preg_replace

(

$patterns

, 

$replace

, 

$text

);

    return 

$text

;

}







/*

 * indexes the given text and returns an array of three arrays:

 *    - 'original': the original text

 *    - 'modified': the modified text, ie the standardized-punctuation form

 *    - 'index': an array of three-element arrays:

 *            - 'form': the form of the word in the original text

 *            - 'index': the index of the form in the modified text

 *            - 'stem': the stem of the form

 *

 *     $text:        string, the text to be processed

 *    $lang:        language of the text (default: English)

 */



function 

indexText

(

$text

, 

$lang

=

'en'

) {

    global 

$punctuation

;

    require_once(

'stoplist_'

.

$lang

.

'.inc.php'

);

    

$indexArray 

= array();

    

$thisText 

= 

standardizePunctuation

(

$text

, 

$lang

);

    

$thisTextWords 

= 

explode

(

' '

,

$thisText

);

    

$thisTextIndex 

= array();

    

$wordIndex 

= 

0

;

    for (

$i

=

0

; 

$i

<

sizeOf

(

$thisTextWords

); 

$i

++) {

        

$form 

= 

$thisTextWords

[

$i

];

        

$word 

= 

strtolower

(

$form

);

        

// words which length is 1 or 0 are not processed.

        

if ((!@

in_array

(

$word

, 

$punctuation

)) && (

strlen

(

$word

) > 

1

) && (!@

in_array

(

$word

, 

$stoplist

))) {

            

$thisTextIndex

[] = array(

'form'

=>

$form

, 

'stem'

=>

PaiceHuskStemmer

(

$word

,

$lang

), 

'index'

=>

$wordIndex

);

        }

        

$wordIndex 

= 

$wordIndex 

+ 

strlen

(

$word

) + 

1

; 

// the last space

    

}

    return array(

'original'

=>

$text

, 

'modified'

=>

$thisText

, 

'index'

=>

$thisTextIndex

);

}







/*

 *    displays some statistics for the Paice/Husk stemmer (default language: English)

 *

 *     $texts:        array of texts

 *    $stem_lang:    language of the texts the given stemmer can handle (default: English)

 *    $if_lang:    language of the interface (default: English)

 *    $precision:    number of digits after the decimal point (default: 1)

 */



function 

getStatistics

(

$texts

, 

$stem_lang

=

'en'

, 

$if_lang

=

'en'

, 

$precision

=

2

) {

    global 

$punctuation

;

    

$total_sample_words 

= 

0

;

    

$total_number_stems 

= 

0

;

    

$total_length 

= 

0

;

    

$max_length 

= 

0

;

    

$min_length 

= 

50

;

    

$unique_words 

= array();

    

$unique_stems 

= array();

    

$nb_changed_words 

= 

0

;

    

$nb_removed_characters 

= 

0

;

    

$nb_removed_characters_distribution 

= array();

    if (

$if_lang 

== 

'fr'

) {

        

$vocab

[

'statistics_of_Paice_Husk_Stemmer'

] = 

'Statistiques du stemmer Paice/Husk.'

;

        

$vocab

[

'Paice_Husk_Stemmer'

] = 

'Stemmer Paice/Husk'

;

        

$vocab

[

'sample_size'

] = 

'Taille de l\'&eacute;chantillon&nbsp;:'

;

        

$vocab

[

'words'

] = 

'mots'

;

        

$vocab

[

'dispatched_in'

] = 

'r&eacute;partis en'

;

        

$vocab

[

'text_units'

] = 

'unit&eacute;s textuelles'

;

        

$vocab

[

'number_of_unique_words'

] = 

'Nombre de mots uniques&nbsp;:'

;

        

$vocab

[

'number_of_stems_found'

] = 

'Nombre de racines&nbsp;:'

;

        

$vocab

[

'number_of_unique_stems_found'

] = 

'Nombre de racines uniques&nbsp;:'

;

        

$vocab

[

'min_max_value_of_found_stems_length'

] = 

'Longueur min/max des racines&nbsp;:'

;

        

$vocab

[

'mean_value_of_found_stems_length'

] = 

'Longueur moyenne des racines&nbsp;:'

;

        

$vocab

[

'number_of_words_per_conflation_class'

] = 

'Nombre de mots par classe de conflation&nbsp;:'

;

        

$vocab

[

'index_compression'

] = 

'Indice de compression&nbsp;:'

;

        

$vocab

[

'word_change_factor'

] = 

'Facteur de changement par mot&nbsp;:'

;

        

$vocab

[

'number_of_characters_removed'

] = 

'Nombre de caract&egrave;res supprim&eacute;s&nbsp;:'

;

        

$vocab

[

'mean_removal_rate'

] = 

'Taux moyen de suppression&nbsp;:'

;

        

$vocab

[

'median_removal_rate'

] = 

'Taux m&eacute;dian de suppression&nbsp;:'

;

    }

    else {

        

// if ($if_lang == 'en') {

        

$vocab

[

'statistics_of_Paice_Husk_Stemmer'

] = 

'Statistics of the Paice/Husk stemmer.'

;

        

$vocab

[

'Paice_Husk_Stemmer'

] = 

'Paice/Husk stemmer'

;

        

$vocab

[

'sample_size'

] = 

'Size of the sample:'

;

        

$vocab

[

'words'

] = 

'words'

;

        

$vocab

[

'dispatched_in'

] = 

'dispatched in'

;

        

$vocab

[

'text_units'

] = 

'text units'

;

        

$vocab

[

'number_of_unique_words'

] = 

'Number of unique words:'

;

        

$vocab

[

'number_of_stems_found'

] = 

'Number of stems found:'

;

        

$vocab

[

'number_of_unique_stems_found'

] = 

'Number of unique stems found:'

;

        

$vocab

[

'min_max_value_of_found_stems_length'

] = 

'Min/Max value of found stems length:'

;

        

$vocab

[

'mean_value_of_found_stems_length'

] = 

'Mean value of found stems length:'

;

        

$vocab

[

'number_of_words_per_conflation_class'

] = 

'Number of words per conflation class:'

;

        

$vocab

[

'index_compression'

] = 

'Index Compression:'

;

        

$vocab

[

'word_change_factor'

] = 

'Word Change Factor:'

;

        

$vocab

[

'number_of_characters_removed'

] = 

'Number of characters removed:'

;

        

$vocab

[

'mean_removal_rate'

] = 

'Mean removal rate:'

;

        

$vocab

[

'median_removal_rate'

] = 

'Median removal rate:'

;

    }

    

    for (

$i

=

0

; 

$i

<

sizeOf

(

$texts

); 

$i

++) {

        

$total_sample_words 

+= 

str_word_count

(

$texts

[

$i

]);

        

$textsArray 

= 

indexText

(

$texts

[

$i

], 

$stem_lang

);

        

$modified_text 

= 

$textsArray

[

'modified'

];

        

$modified_words 

= 

explode

(

' '

,

$modified_text

);

        foreach (

$modified_words 

as 

$word

)

            if ((!

in_array

(

$word

,

$unique_words

)) && (

$word 

!= 

''

) && (!

in_array

(

$word

, 

$punctuation

)))

                

$unique_words

[] = 

$word

;

        

        

$total_number_stems 

+= 

sizeOf

(

$textsArray

[

'index'

]);

        foreach (

$textsArray

[

'index'

] as 

$stems

) {

            

$stem_length 

= 

strlen

(

$stems

[

'stem'

]);

            

$form_length 

= 

strlen

(

strtolower

(

$stems

[

'form'

]));

            if (!

in_array

(

$stems

[

'stem'

],

$unique_stems

))

                

$unique_stems

[] = 

$stems

[

'stem'

];

            if (

$stems

[

'stem'

] != 

strtolower

(

$stems

[

'form'

]))

                

$nb_changed_words

++;

            

$nb_removed_characters 

+= 

$form_length 

- 

$stem_length

;

            

$nb_removed_characters_distribution

[] = 

$form_length 

- 

$stem_length

;



            

$total_length 

+= 

$stem_length

;

            if (

$stem_length 

< 

$min_length

) 

$min_length 

= 

$stem_length

;

            if (

$stem_length 

> 

$max_length

) 

$max_length 

= 

$stem_length

;

        }

    }

    

$total_sample_unique_words 

= 

sizeOf

(

$unique_words

);

    

$total_number_unique_stems 

= 

sizeOf

(

$unique_stems

);

    

    

sort

(

$nb_removed_characters_distribution

);

    

$sizeOf_nb_removed_characters_distribution 

= 

sizeOf

(

$nb_removed_characters_distribution

);

    if (

$sizeOf_nb_removed_characters_distribution

%

2 

== 

0

)

        

$median_removal_rate 

= 

0.5 

* (

$nb_removed_characters_distribution

[

$sizeOf_nb_removed_characters_distribution

/

2

] + 

$nb_removed_characters_distribution

[

1

+(

$sizeOf_nb_removed_characters_distribution

/

2

)]);

    else 

$median_removal_rate 

= 

$nb_removed_characters_distribution

[(

$sizeOf_nb_removed_characters_distribution

+

1

)/

2

];



    echo 

"<b>

${

vocab

[

'statistics_of_Paice_Husk_Stemmer'

]}

</b><br><br>

\n

"

;

    echo 

$vocab

[

'sample_size'

].

' '

.

$total_sample_words

.

' '

.

$vocab

[

'words'

].

' ('

.

$vocab

[

'dispatched_in'

].

' '

.

sizeOf

(

$texts

).

' '

.

$vocab

[

'text_units'

].

")<br>\n"

;

    echo 

$vocab

[

'number_of_unique_words'

].

" $total_sample_unique_words<br>

\n

"

;

    echo 

"<br>\n"

;

    echo 

'<table border="1" rules="group">'

.

"\n"

;

    echo 

'<thead><th></th><th>&nbsp;'

.

$vocab

[

'Paice_Husk_Stemmer'

].

"&nbsp;</th></thead>\n"

;

    echo 

"<tfoot></tfoot>\n"

;

    

    echo 

'<tbody><tr><td align="right">'

.

$vocab

[

'number_of_stems_found'

].

"&nbsp;</td>\n"

;

    echo 

"<td align=

\"

center

\"

>$total_number_unique_stems</td>

\n

"

;

    echo 

"</tr>\n"

;

    echo 

"<tr>\n"

;

    echo 

'<td align="right">'

.

$vocab

[

'number_of_unique_stems_found'

].

"&nbsp;</td>\n"

;

    echo 

"<td align=

\"

center

\"

>$total_number_unique_stems</td>

\n

"

;

    echo 

"</tr></tbody>\n"

;

    

    echo 

'<tbody><tr><td align="right">'

.

$vocab

[

'min_max_value_of_found_stems_length'

].

"&nbsp;</td>\n"

;

    echo 

"<td align=

\"

center

\"

>$min_length / $max_length</td>

\n

"

;

    echo 

"</tr>"

;

    echo 

'<tr><td align="right">'

.

$vocab

[

'mean_value_of_found_stems_length'

].

"&nbsp;</td>"

;

    echo 

'<td align="center">'

.

round

((

$total_length

/

$total_number_stems

),

$precision

).

"</td>\n"

;

    echo 

"</tr></tbody>\n"

;



    echo 

'<tbody><tr><td align="right">'

.

$vocab

[

'number_of_words_per_conflation_class'

].

"&nbsp;</td>\n"

;

    echo 

'<td align="center">'

.

round

(

$total_sample_unique_words

/

$total_number_unique_stems

,

$precision

).

"</td>\n"

;

    echo 

'<tr><td align="right">'

.

$vocab

[

'index_compression'

].

"&nbsp;</td>\n"

;

    echo 

'<td align="center">'

.

round

((

$total_sample_unique_words 

- 

$total_number_unique_stems

)/

$total_sample_unique_words

,

$precision

).

"</td>\n"

;

    echo 

'<tr><td align="right">'

.

$vocab

[

'word_change_factor'

].

"&nbsp;</td>\n"

;

    echo 

'<td align="center">'

.

round

(

$nb_changed_words

/

$total_sample_words

,

$precision

).

"</td>\n"

;

    echo 

"</tr></tbody>\n"

;



    echo 

'<tbody><tr><td align="right">'

.

$vocab

[

'number_of_characters_removed'

].

"&nbsp;</td>\n"

;

    echo 

"<td align=

\"

center

\"

>$nb_removed_characters</td>

\n

"

;

    echo 

'<tr><td align="right">'

.

$vocab

[

'mean_removal_rate'

].

"&nbsp;</td>\n"

;

    echo 

'<td align="center">'

.

round

(

$nb_removed_characters

/

$total_number_stems

,

$precision

).

"</td>\n"

;

    echo 

'<tr><td align="right">'

.

$vocab

[

'median_removal_rate'

].

"&nbsp;</td>\n"

;

    echo 

"<td align=

\"

center

\"

>$median_removal_rate</td>

\n

"

;

    echo 

"</tr></tbody>\n"

;



    echo 

"</table>\n"

;

}







/*

 * lists the most common suffixes of a corpus in the given $language

 * which are not listed in the matching words2stems_<language>.inc.php file

 *

 * $language:        text language (default: English)

 * $checkWords2Stems:    boolean (default value: False)

 *            if True, lists only the words which are not listed in the matching words2stems_<language>.inc.php file

 */



function 

listSuffixes

(

$language

=

'en'

, 

$checkWords2Stems 

= 

False

) {

    

// opens the file

    

$corpus_handle 

= @

fopen

(

'corpus_'

.

$language

.

'.txt'

,

'r'

);

    if (

$checkWords2Stems

) require_once(

'words2stems_'

.

$language

.

'.inc.php'

);

    if (!

$corpus_handle

)

        die(

'Failed to open the file corpus_'

.

$language

.

'.txt'

);

        

    

$total_words 

= 

0

;

    

$words 

= array();

    

$line 

= 

trim

(

fgets

(

$corpus_handle

));

    while (!

feof

(

$corpus_handle

)) {

        

// removes the punctuation

        

if ((

$line 

!= 

''

) && (!

preg_match

(

'/^p[0-9]/'

,

$line

))) {

            

$line 

= 

preg_replace

(

'/( )*(["\'\.,;:\(\)\?!])( )*/'

, 

' '

, 

$line

);

            

            

// extract the words

            

$thoseWords 

= 

explode

(

' '

,

$line

);

            if (!

preg_match

(

'/[0-9]+/'

,

$word

))

                

$total_words 

+= 

sizeOf

(

$thoseWords

);

            

            

// creates an array of reversed words as key and their number of occurrences as value

            // if they have a length of at least 4 characters

            

foreach(

$thoseWords 

as 

$word

) {

                

$word 

= 

strrev

(

strtolower

(

trim

(

$word

)));

                if ((

strlen

(

$word

) > 

3

) && (!

preg_match

(

'/[0-9]+/'

,

$word

))) {

                    if (!

array_key_exists

(

$word

,

$words

)) 

$words

[

$word

] = 

1

;

                    else 

$words

[

$word

]++;

                }

            }

        }

        

$line 

= 

trim

(

fgets

(

$corpus_handle

));

    }

    

    

// computes their frequency (in percentage of their number of occurrences)

    // such as x% of the words in the corpus have the form...

    

foreach(

$words 

as 

$word 

=> 

$occ

) {

        

$words

[

$word

] = 

round

(

$occ

/

$total_words

,

7

)*

100

;

    }

    

    

// sorts them by frequency

    

arsort

(

$words

);

    

    

// removes those having a frequency less than 0.0001 %

    

while (

array_pop

(

$words

) < 

0.0001

);



    if (

$checkWords2Stems

) {

        

// removes the words already handled by 'words2stems_<language>.inc.php'

        // rewrites $words2stems keys utf8-encoded

        

$new_words2stems 

= array();

        foreach(

$words2stems 

as 

$word 

=> 

$stem

)

            

$new_words2stems

[

utf8_decode

(

$word

)] = 

$stem

;

        

$words2stems 

= 

$new_words2stems

;

        unset(

$new_words2stems

);

        

        

$new_words 

= array();

        foreach(

$words 

as 

$word 

=> 

$freq

) {

            if (!

array_key_exists

(

strrev

(

$word

), 

$words2stems

))

                

$new_words

[

$word

] = 

$freq

;

        }

        

$words 

= 

$new_words

;

        unset(

$new_words

);

    }

    

    

$words_per_frequency 

= array();

    foreach(

$words 

as 

$word 

=> 

$freq

) {

        if (!

in_array

(

$freq

, 

$words_per_frequency

)) {

            

$words_per_frequency

[

"$freq"

][] = 

$word

;

        }

    }

    unset(

$words

);

    

    

// writes them out in a text file

    

$stems_handle 

= @

fopen

(

'corpus_stems_'

.

$language

.

'.txt'

,

'w'

);

    if (!

$stems_handle

)

        die(

'Failed to open the file corpus_stems_'

.

$language

.

'.txt'

);

    foreach(

$words_per_frequency 

as 

$frequency 

=> 

$words

) {

        

// sorts them by inverse suffixes

        

sort

(

$words

);

        if (!

fwrite

(

$stems_handle

,

"\nfrequency: "

.

$frequency

.

"\n---------\n"

))

            die(

'Failed to write in the file corpus_stems_'

.

$language

.

'.txt'

);

        foreach (

$words 

as 

$word

) {

            

$reversed 

= 

strrev

(

$word

);

            if (!

fwrite

(

$stems_handle

,

$word

.

"\t("

.

$reversed

.

")\n"

))

                die(

'Failed to write in the file corpus_stems_'

.

$language

.

'.txt'

);

        }

    }

        

    

// closes the files

    

fclose

(

$corpus_handle

);

    

fclose

(

$stems_handle

);

    

    

// displays ok message

    

echo 

"Corpus size: $total_words forms.<br>"

;

    echo 

'The stems of the file <b>corpus_'

.

$language

.

'.txt</b> have been saved in <b>corpus_stems_'

.

$language

.

'.txt</b>.'

;

}







/*

 * displays for each word of the words2stems array the word and the returned stem

 * after having processed it with the new set of rules if the returned stem is 

 * different from the expected one.

 *

 * $language:    text language (default: French)

 * $if_lang:    language of the interface (default: English)

 */ 



function 

developPaiceHusk

(

$language

=

'fr'

, 

$if_lang

=

'en'

) {

    require_once(

'words2stems_'

.

$language

.

'.inc.php'

);

    if (

$if_lang 

== 

'fr'

) {

        

$vocab

[

'troubles'

] = 

'Des probl&egrave;mes sont encore pr&eacute;sents avec les racines suivantes...'

;

        

$vocab

[

'instead_of'

] = 

'au lieu de'

;

        

$vocab

[

'exact_match'

] = 

'Les racines trouv&eacute;es correspondent &agrave; celles list&eacute;es dans le fichier'

;

        

$vocab

[

'file'

] = 

''

;

    }

    else {

        

// if ($if_lang == 'en') {

        

$vocab

[

'troubles'

] = 

'There\'s still some troubles with the following stems...'

;

        

$vocab

[

'instead_of'

] = 

'instead of'

;

        

$vocab

[

'exact_match'

] = 

'The stems found match exactly the ones listed in the'

;

        

$vocab

[

'file'

] = 

' file'

;

    }

    

    

$cpt 

= 

0

;

    foreach(

$words2stems 

as 

$word 

=> 

$stem

) {

        

$stemmer_stem 

= 

PaiceHuskStemmer

(

$word

,

$language

);

        if (

$stem 

!= 

$stemmer_stem

) {

            if (

$cpt 

== 

0

) echo 

'<br><b>'

,

$vocab

[

'troubles'

].

'</b><br><blockquote>'

;

            

$cpt

++;

            echo 

"$cpt. $word => <i>$stemmer_stem</i> "

.

$vocab

[

'instead_of'

].

" <i>$stem</i><br>"

;

        }

    }

    if (

$cpt 

!= 

0

) echo 

'</blockquote>'

;

    else echo 

'<br><b>'

.

$vocab

[

'exact_match'

].

' <i>words2stems_'

.

$language

.

'</i>'

.

$vocab

[

'file'

].

'.</b>'

;

}





?>