| /* |
| * Affix stripping stemming algorithm for Tamil |
| * By Damodharan Rajalingam |
| */ |
| |
| /* Use { and } as the escape delimiters inside string literals, so that |
| * '{ka}' expands to the string defined under the name ka below. */ |
| stringescapes {} |
| |
| /* Aytham */ |
| stringdef aytham hex '0B83' |
| |
| /* Uyir - independent vowels */ |
| stringdef a hex '0B85' |
| stringdef aa hex '0B86' |
| stringdef i hex '0B87' |
| stringdef ii hex '0B88' |
| stringdef u hex '0B89' |
| stringdef uu hex '0B8A' |
| stringdef e hex '0B8E' |
| stringdef ee hex '0B8F' |
| stringdef ai hex '0B90' |
| stringdef o hex '0B92' |
| stringdef oo hex '0B93' |
| stringdef au hex '0B94' |
| |
| /* Consonants */ |
| /* Note: two pairs below are aliases for the same code point: |
| * ta / tha are both 0BA4, and llla / zha are both 0BB4. */ |
| stringdef ka hex '0B95' |
| stringdef nga hex '0B99' |
| stringdef ca hex '0B9A' |
| stringdef ja hex '0B9C' |
| stringdef nya hex '0B9E' |
| stringdef tta hex '0B9F' |
| stringdef nna hex '0BA3' |
| stringdef ta hex '0BA4' |
| stringdef tha hex '0BA4' |
| stringdef na hex '0BA8' |
| stringdef nnna hex '0BA9' |
| stringdef pa hex '0BAA' |
| stringdef ma hex '0BAE' |
| stringdef ya hex '0BAF' |
| stringdef ra hex '0BB0' |
| stringdef rra hex '0BB1' |
| stringdef la hex '0BB2' |
| stringdef lla hex '0BB3' |
| stringdef llla hex '0BB4' |
| stringdef zha hex '0BB4' |
| stringdef va hex '0BB5' |
| |
| /* Vatamozi - borrowed */ |
| stringdef sha hex '0BB6' |
| stringdef ssa hex '0BB7' |
| stringdef sa hex '0BB8' |
| stringdef ha hex '0BB9' |
| |
| |
| /* Dependent vowel signs (kombu etc.) */ |
| stringdef vs_aa hex '0BBE' |
| stringdef vs_i hex '0BBF' |
| stringdef vs_ii hex '0BC0' |
| stringdef vs_u hex '0BC1' |
| stringdef vs_uu hex '0BC2' |
| stringdef vs_e hex '0BC6' |
| stringdef vs_ee hex '0BC7' |
| stringdef vs_ai hex '0BC8' |
| stringdef vs_o hex '0BCA' |
| stringdef vs_oo hex '0BCB' |
| stringdef vs_au hex '0BCC' |
| |
| /* Pulli */ |
| stringdef pulli hex '0BCD' |
| |
| /* AU length mark */ |
| stringdef au_lmark hex '0BD7' |
| |
| |
| /* Internal routines; each one is defined further down in this file. */ |
| routines ( |
| remove_plural_suffix |
| remove_question_suffixes |
| remove_question_prefixes |
| remove_pronoun_prefixes |
| remove_command_suffixes |
| remove_um |
| remove_vetrumai_urupukal |
| fix_va_start |
| fix_ending |
| fix_endings |
| remove_tense_suffix |
| remove_tense_suffixes |
| remove_common_word_endings |
| has_min_length |
| ) |
| |
| /* stem is the only routine exported to callers. */ |
| externals ( stem ) |
| |
| /* Flags shared between routines: |
| * found_a_match - set by the removal routines after they strip or rewrite |
| * something; drives the loop in remove_tense_suffixes. |
| * found_vetrumai_urupu - set by remove_vetrumai_urupukal; enables one rule in |
| * fix_ending. |
| * found_wrong_ending - set by fix_ending when it rewrote an ending; drives |
| * the loop in fix_endings. */ |
| booleans ( |
| found_a_match |
| found_vetrumai_urupu |
| found_wrong_ending |
| ) |
| |
| /* length is a scratch copy of len, written by has_min_length and fix_ending |
| * and read again inside remove_vetrumai_urupukal. */ |
| integers ( |
| length |
| ) |
| |
| // Signals t only when the current word is longer than 4 characters. |
| // Side effect: the measured len is left in the integer `length`; |
| // remove_vetrumai_urupukal relies on that value for its `$length >= 7` guard, |
| // so the assignment must not be folded away. |
| define has_min_length as ( |
| $length = len |
| $length > 4 |
| ) |
| |
| // At the cursor, replaces va followed by one of the vowel signs oo / o / u / uu |
| // with the corresponding independent vowel. Signals f when none of the four |
| // combinations is present. The four strings are mutually exclusive (none is a |
| // prefix of another), so at most one entry can match. |
| define fix_va_start as ( |
|     [substring] among( |
|         '{va}{vs_oo}' (<- '{oo}') |
|         '{va}{vs_o}'  (<- '{o}') |
|         '{va}{vs_u}'  (<- '{u}') |
|         '{va}{vs_uu}' (<- '{uu}') |
|     ) |
| ) |
| |
| // Runs fix_ending repeatedly until a pass leaves the word unchanged. |
| // Always signals t; found_wrong_ending is unset on exit. |
| define fix_endings as ( |
|     repeat ( |
|         do fix_ending |
|         // fix_ending sets this flag only when it rewrote something; once it |
|         // is clear the body fails and the loop stops. |
|         found_wrong_ending |
|     ) |
| ) |
| |
| // Deletes a leading e + consonant + pulli sequence, then lets fix_va_start |
| // repair the new beginning of the word. Signals f (without calling |
| // fix_va_start) when the prefix is not present. |
| define remove_question_prefixes as ( |
|     [ |
|         '{e}' |
|         among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') |
|         '{pulli}' |
|     ] delete |
|     do fix_va_start |
| ) |
| |
| // Repairs one malformed word ending. |
| // Working backwards from the end of the word, the first rule that matches is |
| // applied, found_wrong_ending is set and the routine signals t. If the word |
| // has 3 characters or fewer, or no rule matches, it signals f and leaves |
| // found_wrong_ending unset. |
| // Side effect: `length` is overwritten with the current len. |
| define fix_ending as ( |
|     unset found_wrong_ending |
|     $length = len |
|     $length > 3 |
|     backwards ( |
|         ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete ) |
|         or |
|         ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete ) |
|         or |
|         ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' ) |
|         or |
|         ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' ) |
|         or |
| //      ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' ) |
|         ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' ) |
|         or |
|         ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' ) |
|         or |
|         // Only considered after remove_vetrumai_urupukal has set its flag. |
|         ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ) |
|         or |
|         ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' ) |
|         or |
|         ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) |
|         or |
|         // A second, standalone '{vs_u}{ka}{pulli}' rule used to sit here; it could |
|         // never fire because the same string is already tried two rules above. |
|         ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) |
|         or |
|         ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' ) |
|         or |
|         ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete ) |
|         or |
|         ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete ) |
|         or |
|         ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' ) |
|         or |
|         ( [ '{nga}{pulli}' ] delete ) |
|         or |
|         ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete ) |
|     ) |
| |
|     // Reached only when one of the rules above matched and was applied. |
|     set found_wrong_ending |
| ) |
| |
| // Deletes a leading a / i / u + consonant + pulli sequence, records the hit in |
| // found_a_match and then lets fix_va_start repair the new word start. |
| // Signals f, with found_a_match unset, when the prefix is absent. |
| define remove_pronoun_prefixes as ( |
|     unset found_a_match |
|     [ |
|         among('{a}' '{i}' '{u}') |
|         among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') |
|         '{pulli}' |
|     ] delete |
|     set found_a_match |
|     do fix_va_start |
| ) |
| |
| // Strips the plural ending ka-lla-pulli from the end of the word. The three |
| // longer forms are tried first because each needs a specific replacement; |
| // the bare ending is simply deleted. Sets found_a_match on success, signals f |
| // otherwise. |
| define remove_plural_suffix as ( |
|     unset found_a_match |
|     backwards ( |
|         ( |
|             ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) |
|             or ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) |
|             or ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) |
|             or ( [ '{ka}{lla}{pulli}' ] delete ) |
|         ) |
|         set found_a_match |
|     ) |
| ) |
| |
| // For words longer than 4 characters: replaces a final vowel sign aa / ee / oo |
| // with pulli (setting found_a_match when it does), then tidies the ending |
| // with fix_endings whether or not anything was replaced. |
| define remove_question_suffixes as ( |
|     has_min_length |
|     unset found_a_match |
|     backwards ( |
|         do ( |
|             [ among('{vs_aa}' '{vs_ee}' '{vs_oo}') ] <- '{pulli}' |
|             set found_a_match |
|         ) |
|     ) |
|     do fix_endings |
| ) |
| |
| // For words longer than 4 characters: deletes a final pa+i or va+i and sets |
| // found_a_match. Signals f when the word is too short or has neither ending. |
| define remove_command_suffixes as ( |
|     has_min_length |
|     unset found_a_match |
|     backwards ( |
|         [ among('{va}{vs_i}' '{pa}{vs_i}') ] delete |
|         set found_a_match |
|     ) |
| ) |
| |
| // For words longer than 4 characters: replaces a final u-ma-pulli with pulli, |
| // sets found_a_match, and then applies a single fix_ending pass. |
| // Signals f when the word is too short or does not end that way. |
| define remove_um as ( |
|     unset found_a_match |
|     has_min_length |
|     backwards ( |
|         [ '{vs_u}{ma}{pulli}' ] <- '{pulli}' |
|         set found_a_match |
|     ) |
|     do fix_ending |
| ) |
| |
| // Removes common words that are written attached to the end of another word. |
| // Applies only to words longer than 4 characters. The first group of endings |
| // is replaced by pulli, the second group is deleted outright; either way |
| // found_a_match is set and fix_endings is run afterwards. Signals f (and skips |
| // fix_endings) when no ending matches. |
| define remove_common_word_endings as ( |
|     // These are not suffixes actually but are |
|     // some words that are attached to other words |
|     // but can be removed for stemming |
|     unset found_a_match |
|     has_min_length |
|     backwards ( |
|         // Alternatives are tried in order; a trailing repeat of |
|         // '{vs_aa}{ka}{vs_i}' was dropped because the earlier entry always |
|         // wins, making the repeat unreachable. |
|         test ( [ '{vs_u}{tta}{nnna}{pulli}' or |
|                  '{vs_i}{la}{pulli}{la}{vs_ai}' or |
|                  '{vs_i}{tta}{ma}{pulli}' or |
|                  '{vs_i}{nnna}{pulli}{rra}{vs_i}' or |
|                  '{vs_aa}{ka}{vs_i}' or |
|                  '{vs_aa}{ka}{vs_i}{ya}' or |
|                  '{vs_e}{nnna}{pulli}{rra}{vs_u}' or |
|                  '{vs_u}{lla}{pulli}{lla}' or |
|                  '{vs_u}{tta}{vs_ai}{ya}' or |
|                  '{vs_u}{tta}{vs_ai}' or |
|                  '{vs_e}{nnna}{vs_u}{ma}{pulli}' or |
|                  ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or |
|                  '{vs_e}{nnna}' ] <- '{pulli}' |
|                (set found_a_match) |
|         ) |
|         or |
|         test ( [ among('{pa}{tta}{vs_u}' |
|                        '{pa}{tta}{pulli}{tta}' |
|                        '{pa}{tta}{pulli}{tta}{vs_u}' |
|                        '{pa}{tta}{pulli}{tta}{ta}{vs_u}' |
|                        '{pa}{tta}{pulli}{tta}{nna}' |
|                        '{ka}{vs_u}{ra}{vs_i}{ya}' |
|                        '{pa}{rra}{pulli}{rra}{vs_i}' |
|                        '{va}{vs_i}{tta}{vs_u}' |
|                        '{va}{vs_i}{tta}{pulli}{tta}{vs_u}' |
|                        '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}' |
|                        '{pa}{tta}{vs_i}' |
|                        '{ta}{vs_aa}{nnna}' |
|                        '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}') |
|                ] delete |
|                (set found_a_match) |
|         ) |
|     ) |
|     do fix_endings |
| ) |
| |
| // Removes vetrumai urupukal (Tamil case markers) from the end of the word. |
| // Applies only to words longer than 4 characters. Four groups of endings are |
| // tried in order, followed by a fallback that shortens a final vowel sign ii |
| // to i. On a match both found_a_match and found_vetrumai_urupu are set and |
| // fix_endings is run; otherwise the routine signals f with both flags unset. |
| define remove_vetrumai_urupukal as ( |
| unset found_a_match |
| unset found_vetrumai_urupu |
| // Besides the length check, this stores len in `length`, which the |
| // `$length >= 7` guard further down reads. |
| has_min_length |
| backwards ( |
| ( |
| // Group 1: delete a final nnna + ai. |
| test ( ['{nnna}{vs_ai}'] delete ) |
| or |
| // Group 2: endings in ai that are replaced by pulli. |
| // NOTE(review): `or` binds tighter than juxtaposition, so the |
| // `test not among(...)` below appears to guard both preceding literals, |
| // not only '{vs_ai}' - confirm this is intended. |
| test ([ ( '{vs_i}{nnna}{vs_ai}' or |
| '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or |
| ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))) |
| ] <- '{pulli}' |
| ) |
| or |
| // Group 3: further endings that are replaced by pulli. |
| test ( [ |
| '{vs_o}{tta}{vs_u}' or |
| '{vs_oo}{tta}{vs_u}' or |
| '{vs_i}{la}{pulli}' or |
| '{vs_i}{rra}{pulli}' or |
| ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or |
| '{vs_i}{nnna}{pulli}{rra}{vs_u}' or |
| '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or |
| '{va}{vs_i}{tta}' or |
| ($length >= 7 '{vs_i}{tta}{ma}{pulli}') or |
| '{vs_aa}{la}{pulli}' or |
| '{vs_u}{tta}{vs_ai}' or |
| '{vs_aa}{ma}{la}{pulli}' or |
| ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or |
| '{vs_u}{lla}{pulli}' |
| ] <- '{pulli}' |
| ) |
| or |
| // Group 4: endings that are deleted outright. |
| test ( [ |
| '{ka}{nna}{pulli}' or |
| '{ma}{vs_u}{nnna}{pulli}' or |
| '{ma}{vs_ee}{la}{pulli}' or |
| '{ma}{vs_ee}{rra}{pulli}' or |
| '{ka}{vs_ii}{llla}{pulli}' or |
| '{pa}{vs_i}{nnna}{pulli}' or |
| ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) |
| ] delete |
| ) |
| or |
| // Fallback: shorten a final vowel sign ii to i. |
| test ([ '{vs_ii}' ] <- '{vs_i}') |
| ) |
| // Reached only when one of the alternatives above succeeded. |
| (set found_a_match) |
| (set found_vetrumai_urupu) |
| // Optional follow-up: a remaining final i-nnna-pulli is also reduced to pulli. |
| do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) |
| ) |
| do fix_endings |
| ) |
| |
| // Calls remove_tense_suffix again and again until a pass strips nothing. |
| // Always signals t; found_a_match is unset on exit. |
| define remove_tense_suffixes as ( |
|     repeat ( |
|         do remove_tense_suffix |
|         // Set by remove_tense_suffix only when it removed something; once it |
|         // is clear the body fails and the loop ends. |
|         found_a_match |
|     ) |
| ) |
| |
| // Removes one tense ending from the end of the word. |
| // Applies only to words longer than 4 characters (otherwise signals f with |
| // found_a_match unset). Inside the backwards section two independent steps |
| // run, each wrapped in `do` so neither can make the routine fail: |
| //   1. four groups of endings tried in order - the first two groups are |
| //      deleted, the third is replaced by pulli, and the last deletes ka+u or |
| //      ta+u when it is preceded by pulli; |
| //   2. a final among that deletes a few longer forms. |
| // Any successful step sets found_a_match. fix_endings is run at the end. |
| // |
| // The pulli-replacement group no longer repeats '{ka}{vs_u}{ma}{pulli}', |
| // '{ta}{vs_u}{ma}{pulli}', '{rra}{vs_u}{ma}{pulli}' and |
| // '{nnna}{vs_e}{nnna}{pulli}': the delete group before it already matches |
| // those strings, so the repeats could never be reached. |
| define remove_tense_suffix as ( |
|     unset found_a_match |
|     has_min_length |
|     backwards ( |
|         do ( |
|             test ( [among( |
|                     '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}' |
|                     '{pa}{tta}{vs_u}' |
|                    )] delete |
|                    (set found_a_match) |
|             ) |
|             or |
|             test ( [ |
|                     '{ma}{vs_aa}{ra}{pulli}' or |
|                     '{ma}{vs_i}{nnna}{pulli}' or |
|                     '{nnna}{nnna}{pulli}' or |
|                     '{nnna}{vs_aa}{nnna}{pulli}' or |
|                     '{nnna}{vs_aa}{lla}{pulli}' or |
|                     '{nnna}{vs_aa}{ra}{pulli}' or |
|                     ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or |
|                     '{nnna}{lla}{pulli}' or |
|                     '{va}{lla}{pulli}' or |
|                     '{nnna}{ra}{pulli}' or |
|                     '{va}{ra}{pulli}' or |
|                     '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or |
|                     '{pa}{nnna}{pulli}' or |
|                     '{pa}{lla}{pulli}' or |
|                     '{pa}{ra}{pulli}' or |
|                     ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or |
|                     '{vs_i}{rra}{pulli}{rra}{vs_u}' or |
|                     '{pa}{ma}{pulli}' or |
|                     '{nnna}{ma}{pulli}' or |
|                     '{ta}{vs_u}{ma}{pulli}' or |
|                     '{rra}{vs_u}{ma}{pulli}' or |
|                     '{ka}{vs_u}{ma}{pulli}' or |
|                     '{nnna}{vs_e}{nnna}{pulli}' or |
|                     '{nnna}{vs_ai}' or |
|                     '{va}{vs_ai}' |
|                    ] delete |
|                    (set found_a_match) |
|             ) |
|             or |
|             test ( [ |
|                     ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or |
|                     '{vs_aa}{lla}{pulli}' or |
|                     '{vs_aa}{ra}{pulli}' or |
|                     '{vs_ee}{nnna}{pulli}' or |
|                     '{vs_aa}' or |
|                     '{vs_aa}{ma}{pulli}' or |
|                     '{vs_e}{ma}{pulli}' or |
|                     '{vs_ee}{ma}{pulli}' or |
|                     '{vs_oo}{ma}{pulli}' or |
|                     '{tta}{vs_u}{ma}{pulli}' or |
|                     '{vs_aa}{ya}{pulli}' or |
|                     '{nnna}{vs_i}{ra}{pulli}' or |
|                     '{vs_ii}{ra}{pulli}' or |
|                     '{vs_ii}{ya}{ra}{pulli}' |
|                    ] <- '{pulli}' |
|                    (set found_a_match) |
|             ) |
|             or |
|             test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete |
|                    (set found_a_match) |
|             ) |
|         ) |
|         do ([among( |
|                 '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}' |
|                 '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}' |
|                 '{ka}{vs_i}{nnna}{pulli}{rra}' |
|                 '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}' |
|                 '{ka}{vs_i}{rra}' |
|                 '{ka}{vs_i}{rra}{pulli}' |
|             )] delete |
|             (set found_a_match) |
|         ) |
|     ) |
|     do fix_endings |
| ) |
| |
| // Exported entry point: stems the current word in place. |
| define stem as ( |
|     unset found_vetrumai_urupu |
| |
|     // One ending repair is attempted on every word, whatever its length. |
|     do fix_ending |
| |
|     // Words of 4 characters or fewer stop here (stem then signals f). |
|     has_min_length |
| |
|     // Prefixes. |
|     do remove_question_prefixes |
|     do remove_pronoun_prefixes |
| |
|     // Suffixes; each step is wrapped in `do`, so a step that finds nothing |
|     // does not prevent the following ones from running. |
|     do remove_question_suffixes |
|     do remove_um |
|     do remove_common_word_endings |
|     do remove_vetrumai_urupukal |
|     do remove_plural_suffix |
|     do remove_command_suffixes |
|     do remove_tense_suffixes |
| ) |