| /* Finnish stemmer. |
| |
| Numbers in square brackets refer to the sections in |
| Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999 |
| ISBN 0-415-20705-3 |
| |
| */ |
| |
| routines ( |
| mark_regions |
| R2 |
| particle_etc possessive |
| LONG VI |
| case_ending |
| i_plural |
| t_plural |
| other_endings |
| tidy |
| ) |
| |
| externals ( stem ) |
| |
| integers ( p1 p2 ) |
| strings ( x ) |
| booleans ( ending_removed ) |
| groupings ( AEI V1 V2 particle_end ) |
| |
| stringescapes {} |
| |
| /* special characters (in ISO Latin I) */ |
| |
| stringdef a" hex 'E4' |
| stringdef o" hex 'F6' |
| |
| define AEI 'a{a"}ei' |
| define V1 'aeiouy{a"}{o"}' |
| define V2 'aeiou{a"}{o"}' |
| define particle_end V1 + 'nt' |
| |
| define mark_regions as ( |
| |
| $p1 = limit |
| $p2 = limit |
| |
| goto V1 gopast non-V1 setmark p1 |
| goto V1 gopast non-V1 setmark p2 |
| ) |
| |
| backwardmode ( |
| |
| define R2 as $p2 <= cursor |
| |
| define particle_etc as ( |
| setlimit tomark p1 for ([substring]) |
| among( |
| 'kin' |
| 'kaan' 'k{a"}{a"}n' |
| 'ko' 'k{o"}' |
| 'han' 'h{a"}n' |
| 'pa' 'p{a"}' // Particles [91] |
| (particle_end) |
| 'sti' // Adverb [87] |
| (R2) |
| ) |
| delete |
| ) |
| define possessive as ( // [36] |
| setlimit tomark p1 for ([substring]) |
| among( |
| 'si' |
| (not 'k' delete) // take 'ksi' as the Comitative case |
| 'ni' |
| (delete ['kse'] <- 'ksi') // kseni = ksi + ni |
| 'nsa' 'ns{a"}' |
| 'mme' |
| 'nne' |
| (delete) |
| /* Now for Vn possessives after case endings: [36] */ |
| 'an' |
| (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) |
| '{a"}n' |
| (among('t{a"}' 'ss{a"}' 'st{a"}' |
| 'll{a"}' 'lt{a"}' 'n{a"}') delete) |
| 'en' |
| (among('lle' 'ine') delete) |
| ) |
| ) |
| |
| define LONG as |
| among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') |
| |
| define VI as ('i' V2) |
| |
| define case_ending as ( |
| setlimit tomark p1 for ([substring]) |
| among( |
| 'han' ('a') //-. |
| 'hen' ('e') // | |
| 'hin' ('i') // | |
| 'hon' ('o') // | |
| 'h{a"}n' ('{a"}') // Illative [43] |
| 'h{o"}n' ('{o"}') // | |
| 'siin' VI // | |
| 'seen' LONG //-' |
| |
| 'den' VI |
| 'tten' VI // Genitive plurals [34] |
| () |
| 'n' // Genitive or Illative |
| ( try ( LONG // Illative |
| or 'ie' // Genitive |
| and next ] |
| ) |
| /* otherwise Genitive */ |
| ) |
| |
| 'a' '{a"}' //-. |
| (V1 non-V1) // | |
| 'tta' 'tt{a"}' // Partitive [32] |
| ('e') // | |
| 'ta' 't{a"}' //-' |
| |
| 'ssa' 'ss{a"}' // Inessive [41] |
| 'sta' 'st{a"}' // Elative [42] |
| |
| 'lla' 'll{a"}' // Adessive [44] |
| 'lta' 'lt{a"}' // Ablative [51] |
| 'lle' // Allative [46] |
| 'na' 'n{a"}' // Essive [49] |
| 'ksi' // Translative[50] |
| 'ine' // Comitative [51] |
| |
| /* Abessive and Instructive are too rare for |
| inclusion [51] */ |
| |
| ) |
| delete |
| set ending_removed |
| ) |
| define other_endings as ( |
| setlimit tomark p2 for ([substring]) |
| among( |
| 'mpi' 'mpa' 'mp{a"}' |
| 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] |
| (not 'po') //-improves things |
| 'impi' 'impa' 'imp{a"}' |
| 'immi' 'imma' 'imm{a"}' // Superlative forms [86] |
| 'eja' 'ej{a"}' // indicates agent [93.1B] |
| ) |
| delete |
| ) |
| define i_plural as ( // [26] |
| setlimit tomark p1 for ([substring]) |
| among( |
| 'i' 'j' |
| ) |
| delete |
| ) |
| define t_plural as ( // [26] |
| setlimit tomark p1 for ( |
| ['t'] test V1 |
| delete |
| ) |
| setlimit tomark p2 for ([substring]) |
| among( |
| 'mma' (not 'po') //-mmat endings |
| 'imma' //-immat endings |
| ) |
| delete |
| ) |
| define tidy as ( |
| setlimit tomark p1 for ( |
| do ( LONG and ([next] delete ) ) // undouble vowel |
| do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i |
| do ( ['j'] 'o' or 'u' delete ) |
| do ( ['o'] 'j' delete ) |
| ) |
| goto non-V1 [next] -> x x delete // undouble consonant |
| ) |
| ) |
| |
| define stem as ( |
| |
| do mark_regions |
| unset ending_removed |
| backwards ( |
| do particle_etc |
| do possessive |
| do case_ending |
| do other_endings |
| (ending_removed do i_plural) or do t_plural |
| do tidy |
| ) |
| ) |