// blob: 694721c03f7094766897af0e40df209e6a05b73d [file] [log] [blame] [edit]
/*
* Authors:
* - Assem Chelli, < assem [dot] ch [at] gmail >
* - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
*
*/
// '{' and '}' delimit a stringdef name inside a string literal, so '{b}'
// in the routines below stands for the code point bound to b here.
stringescapes { }
/* the Arabic letters in Unicode */
// Each stringdef binds a short Latin mnemonic to one code point (hex).
// Hamza
stringdef o hex '621' // Hamza
stringdef ao hex '623' // Hamza above Alef
stringdef ao_ hex '625' // Hamza below Alef
stringdef a~ hex '622' // Alef madda
stringdef wo hex '624' // Hamza above waw
stringdef yo hex '626' // Hamza above yeh
// Letters
stringdef a hex '627' // Alef
stringdef a_ hex '649' // Alef Maksura
stringdef b hex '628' // Beh
stringdef t_ hex '629' // Teh_Marbuta
stringdef t hex '62a' // Teh
stringdef th hex '62b' // Theh
stringdef j hex '62c' // Jeem
stringdef h hex '62d' // Hah
stringdef x hex '62e' // Khah
stringdef d hex '62f' // Dal
stringdef dz hex '630' // Thal
stringdef r hex '631' // Reh
stringdef z hex '632' // Zain
stringdef s hex '633' // Seen
stringdef sh hex '634' // Sheen
stringdef c hex '635' // Sad
stringdef dh hex '636' // Dad
stringdef tt hex '637' // Tah
stringdef zh hex '638' // Zah
stringdef i hex '639' // Ain
stringdef gh hex '63a' // Ghain
stringdef f hex '641' // Feh
stringdef q hex '642' // Qaf
stringdef k hex '643' // Kaf
stringdef l hex '644' // Lam
stringdef m hex '645' // Meem
stringdef n hex '646' // Noon
stringdef e hex '647' // Heh
stringdef w hex '648' // Waw
stringdef y hex '64a' // Yeh
// Diacritics (all of these are deleted by Normalize_pre)
stringdef aan hex '64b' // FatHatan
stringdef uun hex '64c' // Dammatan
stringdef iin hex '64d' // Kasratan
stringdef aa hex '64e' // FatHa
stringdef uu hex '64f' // Damma
stringdef ii hex '650' // Kasra
stringdef oo hex '652' // Sukun
stringdef ~ hex '651' // Shadda
// Arabic-Indic digits U+0660..U+0669 (Normalize_pre maps them to ASCII '0'..'9')
stringdef 0 hex '0660'
stringdef 1 hex '0661'
stringdef 2 hex '0662'
stringdef 3 hex '0663'
stringdef 4 hex '0664'
stringdef 5 hex '0665'
stringdef 6 hex '0666'
stringdef 7 hex '0667'
stringdef 8 hex '0668'
stringdef 9 hex '0669'
// Arabic numeric signs (deleted by Normalize_pre)
stringdef % hex '066a' // PERCENT
stringdef . hex '066b' // DECIMAL
// NOTE(review): this binds the name ' (used as '{'}' in Normalize_pre) to
// U+066C; confirm the Snowball compiler in use allows overriding that name.
stringdef ' hex '066c' // THOUSANDS
// Kasheeda
stringdef _ hex '640' // Kasheeda, Tatweel
// Punctuation marks
stringdef , hex '060C' // COMMA
stringdef ; hex '061B' // SEMICOLON
stringdef ? hex '061F' // QUESTION
// Shaped forms
// Code points in the range U+FE80..U+FEFC. The mnemonic is the base letter's
// name plus a numeric suffix that distinguishes the variants of that letter.
// Normalize_pre rewrites every variant to its base letter, so the stemming
// steps only ever see the base code points defined earlier in this file.
// NOTE(review): the suffix numbering is not uniform (compare t3/t4 =
// fe95/fe96 with th3/th4 = fe9a/fe99); this has no effect on behavior because
// all variants of a letter share one replacement in Normalize_pre.
stringdef o1 hex 'fe80' // HAMZA
stringdef ao1 hex 'fe83' // ALEF_HAMZA_ABOVE
stringdef ao2 hex 'fe84' // ALEF_HAMZA_ABOVE
stringdef ao_1 hex 'fe87' // ALEF_HAMZA_BELOW
stringdef ao_2 hex 'fe88' // ALEF_HAMZA_BELOW
stringdef yo1 hex 'fe8b' // YEH_HAMZA
stringdef yo2 hex 'fe8c' // YEH_HAMZA
stringdef yo3 hex 'fe89' // YEH_HAMZA
stringdef yo4 hex 'fe8a' // YEH_HAMZA
stringdef a~1 hex 'fe81' // ALEF_MADDA
stringdef a~2 hex 'fe82' // ALEF_MADDA
stringdef wo1 hex 'fe85' // WAW_HAMZA
stringdef wo2 hex 'fe86' // WAW_HAMZA
stringdef a1 hex 'fe8d' // ALEF
stringdef a2 hex 'fe8e' // ALEF
stringdef b1 hex 'fe8f' // BEH
stringdef b2 hex 'fe90' // BEH
stringdef b3 hex 'fe91' // BEH
stringdef b4 hex 'fe92' // BEH
stringdef t_1 hex 'fe93' // TEH_MARBUTA
stringdef t_2 hex 'fe94' // TEH_MARBUTA
stringdef t1 hex 'fe97' // TEH
stringdef t2 hex 'fe98' // TEH
stringdef t3 hex 'fe95' // TEH
stringdef t4 hex 'fe96' // TEH
stringdef th1 hex 'fe9b' // THEH
stringdef th2 hex 'fe9c' // THEH
stringdef th3 hex 'fe9a' // THEH
stringdef th4 hex 'fe99' // THEH
stringdef j1 hex 'fe9f' // JEEM
stringdef j2 hex 'fea0' // JEEM
stringdef j3 hex 'fe9d' // JEEM
stringdef j4 hex 'fe9e' // JEEM
stringdef h1 hex 'fea3' // HAH
stringdef h2 hex 'fea4' // HAH
stringdef h3 hex 'fea1' // HAH
stringdef h4 hex 'fea2' // HAH
stringdef x1 hex 'fea7' // KHAH
stringdef x2 hex 'fea8' // KHAH
stringdef x3 hex 'fea5' // KHAH
stringdef x4 hex 'fea6' // KHAH
stringdef d1 hex 'fea9' // DAL
stringdef d2 hex 'feaa' // DAL
stringdef dz1 hex 'feab' // THAL
stringdef dz2 hex 'feac' // THAL
stringdef r1 hex 'fead' // REH
stringdef r2 hex 'feae' // REH
stringdef z1 hex 'feaf' // ZAIN
stringdef z2 hex 'feb0' // ZAIN
stringdef s1 hex 'feb3' // SEEN
stringdef s2 hex 'feb4' // SEEN
stringdef s3 hex 'feb1' // SEEN
stringdef s4 hex 'feb2' // SEEN
stringdef sh1 hex 'feb7' // SHEEN
stringdef sh2 hex 'feb8' // SHEEN
stringdef sh3 hex 'feb5' // SHEEN
stringdef sh4 hex 'feb6' // SHEEN
stringdef c1 hex 'febb' // SAD
stringdef c2 hex 'febc' // SAD
stringdef c3 hex 'feb9' // SAD
stringdef c4 hex 'feba' // SAD
stringdef dh1 hex 'febf' // DAD
stringdef dh2 hex 'fec0' // DAD
stringdef dh3 hex 'febd' // DAD
stringdef dh4 hex 'febe' // DAD
stringdef tt1 hex 'fec3' // TAH
stringdef tt2 hex 'fec4' // TAH
stringdef tt3 hex 'fec1' // TAH
stringdef tt4 hex 'fec2' // TAH
stringdef zh1 hex 'fec7' // ZAH
stringdef zh2 hex 'fec8' // ZAH
stringdef zh3 hex 'fec5' // ZAH
stringdef zh4 hex 'fec6' // ZAH
stringdef i1 hex 'fecb' // AIN
stringdef i2 hex 'fecc' // AIN
stringdef i3 hex 'fec9' // AIN
stringdef i4 hex 'feca' // AIN
stringdef gh1 hex 'fecf' // GHAIN
stringdef gh2 hex 'fed0' // GHAIN
stringdef gh3 hex 'fecd' // GHAIN
stringdef gh4 hex 'fece' // GHAIN
stringdef f1 hex 'fed3' // FEH
stringdef f2 hex 'fed4' // FEH
stringdef f3 hex 'fed1' // FEH
stringdef f4 hex 'fed2' // FEH
stringdef q1 hex 'fed7' // QAF
stringdef q2 hex 'fed8' // QAF
stringdef q3 hex 'fed5' // QAF
stringdef q4 hex 'fed6' // QAF
stringdef k1 hex 'fedb' // KAF
stringdef k2 hex 'fedc' // KAF
stringdef k3 hex 'fed9' // KAF
stringdef k4 hex 'feda' // KAF
stringdef l1 hex 'fedf' // LAM
stringdef l2 hex 'fee0' // LAM
stringdef l3 hex 'fedd' // LAM
stringdef l4 hex 'fede' // LAM
stringdef m1 hex 'fee3' // MEEM
stringdef m2 hex 'fee4' // MEEM
stringdef m3 hex 'fee1' // MEEM
stringdef m4 hex 'fee2' // MEEM
stringdef n1 hex 'fee7' // NOON
stringdef n2 hex 'fee8' // NOON
stringdef n3 hex 'fee5' // NOON
stringdef n4 hex 'fee6' // NOON
stringdef e1 hex 'feeb' // HEH
stringdef e2 hex 'feec' // HEH
stringdef e3 hex 'fee9' // HEH
stringdef e4 hex 'feea' // HEH
stringdef w1 hex 'feed' // WAW
stringdef w2 hex 'feee' // WAW
stringdef a_1 hex 'feef' // ALEF_MAKSURA
stringdef a_2 hex 'fef0' // ALEF_MAKSURA
stringdef y1 hex 'fef3' // YEH
stringdef y2 hex 'fef4' // YEH
stringdef y3 hex 'fef1' // YEH
stringdef y4 hex 'fef2' // YEH
// Ligatures Lam-Alef
// Single code points that Normalize_pre expands to the two-letter sequence
// Lam followed by the matching Alef variant.
stringdef la hex 'fefb' // LAM_ALEF
stringdef la2 hex 'fefc' // LAM_ALEF
stringdef lao hex 'fef7' // LAM_ALEF_HAMZA_ABOVE
stringdef lao2 hex 'fef8' // LAM_ALEF_HAMZA_ABOVE
stringdef lao_ hex 'fef9' // LAM_ALEF_HAMZA_BELOW
stringdef lao_2 hex 'fefa' // LAM_ALEF_HAMZA_BELOW
stringdef la~ hex 'fef5' // LAM_ALEF_MADDA_ABOVE
stringdef la~2 hex 'fef6' // LAM_ALEF_MADDA_ABOVE
// Scratch integer: the length-guarded steps assign the current word length
// (in characters) to it before comparing it against a minimum.
integers ( word_len )

// Word-class guesses shared between Checks1, the prefix steps and stem.
booleans ( is_noun is_verb is_defined )

routines (
    // normalization
    Normalize_pre
    Normalize_post

    // classification
    Checks1

    // prefix steps (forward mode)
    Prefix_Step1
    Prefix_Step2
    Prefix_Step3a_Noun
    Prefix_Step3b_Noun
    Prefix_Step3_Verb
    Prefix_Step4_Verb

    // suffix steps (backward mode)
    Suffix_Noun_Step1a
    Suffix_Noun_Step1b
    Suffix_Noun_Step2a
    Suffix_Noun_Step2b
    Suffix_Noun_Step2c1
    Suffix_Noun_Step2c2
    Suffix_Noun_Step3
    Suffix_Verb_Step1
    Suffix_Verb_Step2a
    Suffix_Verb_Step2b
    Suffix_Verb_Step2c
    Suffix_All_alef_maqsura
)

// The only routine visible to callers of the generated stemmer.
externals ( stem )

// No character groupings are used by this stemmer.
groupings ( )
// Normalizations
// Pre-stemming normalization. The loop body is repeated `len` times; each
// repetition either rewrites/deletes the character at the cursor according to
// the among table below, or, when nothing in the table matches there, steps
// over one character with `next`.
define Normalize_pre as (
loop len (
(
[substring] among (
'{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization
'{_}' ( delete ) // strip kasheeda
// Punctuation marks
// ('\' below is a one-character string: the escape characters declared by
// stringescapes in this file are { and }, not the backslash)
'.' ',' ';' ':' '?' '!' '/' '*' '%' '\' '"' ( delete) // General
'{,}' '{;}' '{?}' ( delete ) // Arabic-specific
// Arabic-Indic digits -> ASCII digits
'{0}' ( <- '0')
'{1}' ( <- '1')
'{2}' ( <- '2')
'{3}' ( <- '3')
'{4}' ( <- '4')
'{5}' ( <- '5')
'{6}' ( <- '6')
'{7}' ( <- '7')
'{8}' ( <- '8')
'{9}' ( <- '9')
// Arabic percent / decimal / thousands signs are dropped
'{%}' '{.}' '{'}' ( delete )
// Shaped forms: every variant is replaced by its base letter
'{o1}' ( <- '{o}' ) // HAMZA
'{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
'{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
'{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
'{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA
'{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA
'{a1}' '{a2}' ( <- '{a}' ) // ALEF
'{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
'{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
'{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
'{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
'{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM
'{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
'{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH
'{d1}' '{d2}' ( <- '{d}' ) // DAL
'{dz1}''{dz2}' ( <- '{dz}' ) // THAL
'{r1}' '{r2}'( <- '{r}' ) // REH
'{z1}' '{z2}' ( <- '{z}' ) // ZAIN
'{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN
'{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
'{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD
'{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD
'{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
'{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH
'{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN
'{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN
'{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
'{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
'{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF
'{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM
'{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
'{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON
'{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
'{w1}' '{w2}' ( <- '{w}' ) // WAW
'{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
'{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH
// Ligatures Lam-Alef: one code point is expanded to Lam + the Alef variant
'{la}' '{la2}' (<- '{l}{a}')
'{lao}' '{lao2}' (<- '{l}{ao}')
'{lao_}' '{lao_2}' (<- '{l}{ao_}')
'{la~}' '{la~2}' (<- '{l}{a~}')
)
)
or
// no table entry matches at the cursor: skip this character
next
)
)
// Post-stemming normalization of hamza carriers.
//
// Pass 1: if the word ends in a hamza carrier (hamza on/under alef, alef
//         madda, hamza on waw, hamza on yeh) that final letter becomes a bare
//         hamza. `backwards` starts matching at the end of the word, so only
//         the last character is examined.
// Pass 2: every remaining hamza carrier anywhere in the word is replaced by
//         its plain carrier letter (alef, waw or yeh).
//
// Both passes are wrapped in `do`, so the routine never fails and leaves the
// cursor where it found it.
define Normalize_post as (
    do (
        // normalize last hamza
        backwards (
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{o}' )
                '{wo}' ( <- '{o}' )
                '{yo}' ( <- '{o}' )
            )
        )
    )
    // `repeat` walks the word until `next` runs off the end. A counted
    // `loop word_len` was used here before, which only worked because the
    // last prefix routine happened to leave a value in `word_len` that was
    // never smaller than the current length; `repeat` removes that hidden
    // dependency on another routine's scratch variable.
    do repeat (
        (
            // normalize other hamza's
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{a}' )
                '{wo}' ( <- '{w}' )
                '{yo}' ( <- '{y}' )
            )
        )
        or
        // not a hamza carrier: step over this character
        next
    )
)
// Checks
// Flags the word as a defined noun (and not a verb) when it starts with one
// of the listed prefixes and is long enough. The word itself is not modified.
define Checks1 as (
    $word_len = len
    [substring] among (
        // two-letter prefixes: word must have at least 4 characters
        '{a}{l}' '{l}{l}' (
            $word_len >= 4
            set is_defined
            set is_noun
            unset is_verb
        )
        // three-letter prefixes: word must have at least 5 characters
        '{k}{a}{l}' '{b}{a}{l}' (
            $word_len >= 5
            set is_defined
            set is_noun
            unset is_verb
        )
    )
)
//prefixes
// Words of at least 4 characters that begin with hamza-above-alef followed
// by a second alef/hamza letter have that two-letter prefix reduced to one
// letter.
define Prefix_Step1 as (
    $word_len = len
    [substring] among (
        '{ao}{ao}' '{ao}{wo}' ( $word_len >= 4 <- '{ao}' )
        '{ao}{ao_}'           ( $word_len >= 4 <- '{ao_}' )
        '{ao}{a~}'            ( $word_len >= 4 <- '{a~}' )
        '{ao}{a}'             ( $word_len >= 4 <- '{a}' )
        // A lone leading '{ao}' is intentionally left alone (rare case).
    )
)
// Removes a leading '{f}' or '{w}' from words of at least 4 characters,
// unless that letter is immediately followed by '{a}'.
define Prefix_Step2 as (
    $word_len = len
    // guard: fail (without moving the cursor) on '{f}{a}' / '{w}{a}'
    not ( '{f}{a}' or '{w}{a}' )
    [substring] among (
        '{w}' '{f}' ( $word_len >= 4 delete )
    )
)
// it is noun and defined:
// removes the prefix itself, with a minimum word length that grows with the
// length of the prefix.
define Prefix_Step3a_Noun as (
    $word_len = len
    [substring] among (
        // two-letter prefixes: at least 5 characters
        '{a}{l}' '{l}{l}'       ( $word_len >= 5 delete )
        // three-letter prefixes: at least 6 characters
        '{k}{a}{l}' '{b}{a}{l}' ( $word_len >= 6 delete )
    )
)
// probably noun and defined:
// on words of at least 4 characters, drops a leading '{b}' and collapses a
// doubled leading '{b}{b}' / '{k}{k}' to a single letter.
define Prefix_Step3b_Noun as (
    $word_len = len
    // exception: words starting with '{b}{a}' are left untouched
    not '{b}{a}'
    [substring] among (
        '{k}{k}' ( $word_len >= 4 <- '{k}' )
        '{b}{b}' ( $word_len >= 4 <- '{b}' )
        '{b}'    ( $word_len >= 4 delete )
        // Single-letter '{k}' and '{l}' are deliberately NOT stripped here:
        // doing so was found to cause confusion.
    )
)
// On words of at least 5 characters, removes a leading '{s}' when it is
// followed by '{y}', '{t}', '{n}' or '{ao}' (the second letter is kept).
define Prefix_Step3_Verb as (
    $word_len = len
    [substring] among (
        '{s}{ao}' ( $word_len >= 5 <- '{ao}' )
        '{s}{n}'  ( $word_len >= 5 <- '{n}' )
        '{s}{t}'  ( $word_len >= 5 <- '{t}' )
        '{s}{y}'  ( $word_len >= 5 <- '{y}' )
        // A bare leading '{s}' is deliberately NOT stripped: doing so was
        // found to cause confusion.
    )
)
// On words of at least 5 characters starting with '{y}{s}{t}', '{n}{s}{t}'
// or '{t}{s}{t}': marks the word as a verb (not a noun) and rewrites the
// prefix to '{a}{s}{t}'.
define Prefix_Step4_Verb as (
    $word_len = len
    [substring] among (
        '{t}{s}{t}' '{n}{s}{t}' '{y}{s}{t}' (
            $word_len >= 5
            unset is_noun
            set is_verb
            <- '{a}{s}{t}'
        )
    )
)
// suffixes
backwardmode (

    // Every routine in this section matches at the END of the word. Each one
    // stores the current word length in word_len, picks the longest listed
    // ending, and removes it only if the word meets the minimum length given
    // for that ending.

    define Suffix_Noun_Step1a as (
        $word_len = len
        [substring] among (
            // three-letter endings: at least 6 characters
            '{e}{m}{a}' '{k}{m}{a}' ( $word_len > 5 delete )
            // two-letter endings: at least 5 characters
            '{e}{m}' '{e}{n}' '{e}{a}' '{k}{m}' '{n}{a}' ( $word_len > 4 delete )
            // one-letter endings: at least 4 characters
            '{e}' '{k}' '{y}' ( $word_len > 3 delete )
        )
    )

    define Suffix_Noun_Step1b as (
        $word_len = len
        [substring] among (
            '{n}' ( $word_len >= 6 delete )
        )
    )

    define Suffix_Noun_Step2a as (
        $word_len = len
        [substring] among (
            '{w}' '{y}' '{a}' ( $word_len >= 5 delete )
        )
    )

    define Suffix_Noun_Step2b as (
        $word_len = len
        [substring] among (
            '{a}{t}' ( $word_len > 4 delete )
        )
    )

    define Suffix_Noun_Step2c1 as (
        $word_len = len
        [substring] among (
            '{t}' ( $word_len > 3 delete )
        )
    )

    // feminine t_
    define Suffix_Noun_Step2c2 as (
        $word_len = len
        [substring] among (
            '{t_}' ( $word_len > 3 delete )
        )
    )

    // ya' nisbiya
    define Suffix_Noun_Step3 as (
        $word_len = len
        [substring] among (
            '{y}' ( $word_len > 2 delete )
        )
    )

    define Suffix_Verb_Step1 as (
        $word_len = len
        [substring] among (
            // three-letter endings: at least 6 characters
            '{k}{m}{w}' '{k}{m}{a}' '{e}{m}{a}' ( $word_len > 5 delete )
            // two-letter endings: at least 5 characters
            '{k}{n}' '{k}{m}' '{e}{n}' '{e}{m}' '{e}{a}' '{n}{a}' '{n}{y}' ( $word_len > 4 delete )
            // one-letter endings: at least 4 characters
            '{k}' '{e}' ( $word_len > 3 delete )
        )
    )

    define Suffix_Verb_Step2a as (
        $word_len = len
        [substring] among (
            // one-letter endings: at least 4 characters
            '{t}' '{a}' '{n}' '{y}' ( $word_len > 3 delete )
            // past: at least 5 characters
            '{t}{n}' '{t}{a}' '{n}{a}' ( $word_len > 4 delete )
            // present: at least 6 characters
            '{y}{n}' '{w}{n}' '{a}{n}' ( $word_len >= 6 delete )
            '{t}{m}{a}' ( $word_len > 5 delete )
        )
    )

    define Suffix_Verb_Step2b as (
        $word_len = len
        [substring] among (
            '{t}{m}' '{w}{a}' ( $word_len > 4 delete )
        )
    )

    define Suffix_Verb_Step2c as (
        $word_len = len
        [substring] among (
            '{t}{m}{w}' ( $word_len > 5 delete )
            '{w}'       ( $word_len > 3 delete )
        )
    )

    // A final alef maqsura is rewritten to yeh (treated as a spelling error).
    // Alternatives that were considered but are not active: deleting it for
    // nouns longer than 3, or rewriting it to alef for verbs.
    define Suffix_All_alef_maqsura as (
        $word_len = len
        [substring] among (
            '{a_}' ( <- '{y}' ) // spell error
        )
    )
)
// Entry point of the stemmer.
// Pipeline: classify -> pre-normalize -> strip suffixes (backward mode) ->
// strip prefixes -> post-normalize. Every stage is wrapped in `do`, so a
// stage that fails does not make stem fail and the cursor is put back to
// where the stage started.
define stem as (
// set initial values
// (a word starts out as both a possible noun and a possible verb)
set is_noun
set is_verb
unset is_defined
// guess type and properties
// NOTE(review): Checks1 runs before Normalize_pre, so its prefix match and
// length test see the raw input (diacritics, kasheeda and shaped forms still
// present) — confirm this ordering is intended.
do Checks1
// normalization pre-stemming
do Normalize_pre
backwards (
do (
//Suffixes for verbs
// Tried only while is_verb is set. Alternatives, in order:
//   1. one or more Suffix_Verb_Step1, then Step2a, or Step2c, or else just
//      step over one character (`next`)
//   2. Suffix_Verb_Step2b
//   3. Suffix_Verb_Step2a
(
is_verb
(
(
(atleast 1 Suffix_Verb_Step1)
( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next)
)
or Suffix_Verb_Step2b
or Suffix_Verb_Step2a
)
)
//Suffixes for nouns
// Tried when the verb branch above fails, and only while is_noun is set.
// The `try` group is optional; Suffix_Noun_Step3 after it must succeed for
// this branch to succeed.
or (
is_noun
(
try (
Suffix_Noun_Step2c2
// Step1a and the bare Step2a are skipped when is_defined is set
or (not is_defined Suffix_Noun_Step1a (
Suffix_Noun_Step2a
or Suffix_Noun_Step2b
or Suffix_Noun_Step2c1
or next))
or (Suffix_Noun_Step1b (
Suffix_Noun_Step2a
or Suffix_Noun_Step2b
or Suffix_Noun_Step2c1))
or (not is_defined Suffix_Noun_Step2a)
or (Suffix_Noun_Step2b)
)
Suffix_Noun_Step3
)
)
// Suffixes for alef maqsura
// Last resort when neither the verb nor the noun branch succeeded.
or Suffix_All_alef_maqsura
)
)
//Prefixes
// Prefix_Step1 and Prefix_Step2 are optional (`try`). After them exactly one
// alternative is taken, in order: Prefix_Step3a_Noun; Prefix_Step3b_Noun if
// is_noun is set; or, if is_verb is set, an optional Prefix_Step3_Verb
// followed by Prefix_Step4_Verb.
do (
try Prefix_Step1
try Prefix_Step2
( Prefix_Step3a_Noun
or (is_noun Prefix_Step3b_Noun)
or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
)
)
// normalization post-stemming
do Normalize_post
)