blob: 1cff66ba86b4d882ef9d71f9a5984be217821806 [file] [log] [blame]
/* Finnish stemmer.
Numbers in square brackets refer to the sections in
Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
ISBN 0-415-20705-3
*/
routines (
mark_regions
R2
particle_etc possessive
LONG VI
case_ending
i_plural
t_plural
other_endings
tidy
)
externals ( stem )
integers ( p1 p2 )
strings ( x )
booleans ( ending_removed )
groupings ( AEI V1 V2 particle_end )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a" hex 'E4'
stringdef o" hex 'F6'
define AEI 'a{a"}ei'
define V1 'aeiouy{a"}{o"}'
define V2 'aeiou{a"}{o"}'
define particle_end V1 + 'nt'
define mark_regions as (
$p1 = limit
$p2 = limit
goto V1 gopast non-V1 setmark p1
goto V1 gopast non-V1 setmark p2
)
backwardmode (
define R2 as $p2 <= cursor
define particle_etc as (
setlimit tomark p1 for ([substring])
among(
'kin'
'kaan' 'k{a"}{a"}n'
'ko' 'k{o"}'
'han' 'h{a"}n'
'pa' 'p{a"}' // Particles [91]
(particle_end)
'sti' // Adverb [87]
(R2)
)
delete
)
define possessive as ( // [36]
setlimit tomark p1 for ([substring])
among(
'si'
(not 'k' delete) // take 'ksi' as the Comitative case
'ni'
(delete ['kse'] <- 'ksi') // kseni = ksi + ni
'nsa' 'ns{a"}'
'mme'
'nne'
(delete)
/* Now for Vn possessives after case endings: [36] */
'an'
(among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete)
'{a"}n'
(among('t{a"}' 'ss{a"}' 'st{a"}'
'll{a"}' 'lt{a"}' 'n{a"}') delete)
'en'
(among('lle' 'ine') delete)
)
)
define LONG as
among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}')
define VI as ('i' V2)
define case_ending as (
setlimit tomark p1 for ([substring])
among(
'han' ('a') //-.
'hen' ('e') // |
'hin' ('i') // |
'hon' ('o') // |
'h{a"}n' ('{a"}') // Illative [43]
'h{o"}n' ('{o"}') // |
'siin' VI // |
'seen' LONG //-'
'den' VI
'tten' VI // Genitive plurals [34]
()
'n' // Genitive or Illative
( try ( LONG // Illative
or 'ie' // Genitive
and next ]
)
/* otherwise Genitive */
)
'a' '{a"}' //-.
(V1 non-V1) // |
'tta' 'tt{a"}' // Partitive [32]
('e') // |
'ta' 't{a"}' //-'
'ssa' 'ss{a"}' // Inessive [41]
'sta' 'st{a"}' // Elative [42]
'lla' 'll{a"}' // Adessive [44]
'lta' 'lt{a"}' // Ablative [51]
'lle' // Allative [46]
'na' 'n{a"}' // Essive [49]
'ksi' // Translative[50]
'ine' // Comitative [51]
/* Abessive and Instructive are too rare for
inclusion [51] */
)
delete
set ending_removed
)
define other_endings as (
setlimit tomark p2 for ([substring])
among(
'mpi' 'mpa' 'mp{a"}'
'mmi' 'mma' 'mm{a"}' // Comparative forms [85]
(not 'po') //-improves things
'impi' 'impa' 'imp{a"}'
'immi' 'imma' 'imm{a"}' // Superlative forms [86]
'eja' 'ej{a"}' // indicates agent [93.1B]
)
delete
)
define i_plural as ( // [26]
setlimit tomark p1 for ([substring])
among(
'i' 'j'
)
delete
)
define t_plural as ( // [26]
setlimit tomark p1 for (
['t'] test V1
delete
)
setlimit tomark p2 for ([substring])
among(
'mma' (not 'po') //-mmat endings
'imma' //-immat endings
)
delete
)
define tidy as (
setlimit tomark p1 for (
do ( LONG and ([next] delete ) ) // undouble vowel
do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i
do ( ['j'] 'o' or 'u' delete )
do ( ['o'] 'j' delete )
)
goto non-V1 [next] -> x x delete // undouble consonant
)
)
define stem as (
do mark_regions
unset ending_removed
backwards (
do particle_etc
do possessive
do case_ending
do other_endings
(ending_removed do i_plural) or do t_plural
do tidy
)
)