src/lib.rs - platform/external/rust/crates/rust-stemmers - Git at Google

 //! This library provides Rust implementations of several stemming algorithms
 //! written in the [Snowball language](https://snowballstem.org/).
 //!
 //!
 //! All algorithms expect the input to already be lowercased.
 //!
 //! # Usage
 //! ```toml
 //! [dependencies]
 //! rust-stemmers = "^1.0"
 //! ```
 //!
 //! ```rust
 //! extern crate rust_stemmers;
 //!
 //! use rust_stemmers::{Algorithm, Stemmer};
 //!
 //! fn main() {
 //!    let en_stemmer = Stemmer::create(Algorithm::English);
 //!    assert_eq!(en_stemmer.stem("fruitlessly"), "fruitless");
 //! }
 //! ```
 extern crate serde;
 #[macro_use]
 extern crate serde_derive;

 use std::borrow::Cow;

 mod snowball;

 use snowball::SnowballEnv;
 use snowball::algorithms;

 /// Enum of all supported stemming algorithms, one variant per language.
 ///
 /// Pass a variant to `Stemmer::create` to obtain the stemmer for that language.
 /// Check the [Snowball website](https://snowballstem.org/) for details on the
 /// individual algorithms.
 ///
 /// The type is `Copy`, comparable with `==`, and derives serde's
 /// `Serialize`/`Deserialize`.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
 pub enum Algorithm {
     Arabic,
     Danish,
     Dutch,
     English,
     Finnish,
     French,
     German,
     Greek,
     Hungarian,
     Italian,
     Norwegian,
     Portuguese,
     Romanian,
     Russian,
     Spanish,
     Swedish,
     Tamil,
     Turkish
 }

 /// Wraps a usable interface around the actual stemmer implementation.
 ///
 /// Build one with `Stemmer::create` and call `Stemmer::stem` for each word.
 pub struct Stemmer {
     // Entry point of the selected language's algorithm. It operates on the
     // `SnowballEnv` it is handed; `stem` ignores the returned `bool`.
     stemmer: fn(&mut SnowballEnv) -> bool,
 }

 impl Stemmer {
     /// Create a new stemmer from an algorithm
     pub fn create(lang: Algorithm) -> Self {
         match lang {
             Algorithm::Arabic => Stemmer { stemmer: algorithms::arabic::stem },
             Algorithm::Danish => Stemmer { stemmer: algorithms::danish::stem },
             Algorithm::Dutch => Stemmer { stemmer: algorithms::dutch::stem },
             Algorithm::English => Stemmer { stemmer: algorithms::english::stem },
             Algorithm::Finnish => Stemmer { stemmer: algorithms::finnish::stem },
             Algorithm::French => Stemmer { stemmer: algorithms::french::stem },
             Algorithm::German => Stemmer { stemmer: algorithms::german::stem },
             Algorithm::Greek => Stemmer { stemmer: algorithms::greek::stem },
             Algorithm::Hungarian => Stemmer { stemmer: algorithms::hungarian::stem },
             Algorithm::Italian => Stemmer { stemmer: algorithms::italian::stem },
             Algorithm::Norwegian => Stemmer { stemmer: algorithms::norwegian::stem },
             Algorithm::Portuguese => Stemmer { stemmer: algorithms::portuguese::stem },
             Algorithm::Romanian => Stemmer { stemmer: algorithms::romanian::stem },
             Algorithm::Russian => Stemmer { stemmer: algorithms::russian::stem },
             Algorithm::Spanish => Stemmer { stemmer: algorithms::spanish::stem },
             Algorithm::Swedish => Stemmer { stemmer: algorithms::swedish::stem },
             Algorithm::Tamil => Stemmer { stemmer: algorithms::tamil::stem },
             Algorithm::Turkish => Stemmer { stemmer: algorithms::turkish::stem },
         }
     }

     /// Stem a single word
     /// Please note, that the input is expected to be all lowercase (if that is applicable).
     pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> {
         let mut env = SnowballEnv::create(input);
         (self.stemmer)(&mut env);
         env.get_current()
     }
 }


 #[cfg(test)]
 mod tests {
     use super::{Stemmer, Algorithm};

     /// Assert that `lhs` stems to `rhs` under the given algorithm.
     ///
     /// The failure message names the offending word and the algorithm so a
     /// mismatch can be located in the vocabulary file.
     fn stemms_to(lhs: &str, rhs: &str, stemmer: Algorithm) {
         assert_eq!(Stemmer::create(stemmer).stem(lhs), rhs,
                    "unexpected stem for {:?} using {:?}", lhs, stemmer);
     }

     /// Stem every line of `test_data/voc_<code>.txt` with `algorithm` and compare
     /// it with the same line of `test_data/res_<code>.txt`.
     ///
     /// Panics if either file cannot be opened or read, or if any word stems to
     /// something other than the expected result.
     fn check_vocabulary(code: &str, algorithm: Algorithm) {
         use std::fs;
         use std::io;
         use std::io::BufRead;

         // Open one of the two fixture files, naming the path when that fails.
         let open = |kind: &str| {
             let path = format!("test_data/{}_{}.txt", kind, code);
             let file = fs::File::open(&path)
                 .unwrap_or_else(|err| panic!("cannot open {}: {}", path, err));
             io::BufReader::new(file)
         };

         let vocab = open("voc");
         let result = open("res");

         // NOTE: `zip` stops at the shorter input, so a difference in line
         // count between the two files is not detected here.
         for (voc, res) in vocab.lines().zip(result.lines()) {
             stemms_to(voc.unwrap().as_str(),
                       res.unwrap().as_str(),
                       algorithm);
         }
     }

     #[test]
     fn german_test() {
         check_vocabulary("ger", Algorithm::German);
     }

     #[test]
     fn english_test() {
         check_vocabulary("en", Algorithm::English);
     }

     #[test]
     fn french_test() {
         check_vocabulary("fr", Algorithm::French);
     }

     #[test]
     fn spanish_test() {
         check_vocabulary("es", Algorithm::Spanish);
     }

     #[test]
     fn portuguese_test() {
         check_vocabulary("pt", Algorithm::Portuguese);
     }

     #[test]
     fn italian_test() {
         check_vocabulary("it", Algorithm::Italian);
     }

     #[test]
     fn romanian_test() {
         check_vocabulary("ro", Algorithm::Romanian);
     }

     #[test]
     fn russian_test() {
         check_vocabulary("ru", Algorithm::Russian);
     }

     #[test]
     fn arabic_test() {
         check_vocabulary("ar", Algorithm::Arabic);
     }

     #[test]
     fn finnish_test() {
         check_vocabulary("fi", Algorithm::Finnish);
     }

     #[test]
     fn greek_test() {
         check_vocabulary("el", Algorithm::Greek);
     }

     #[test]
     fn norwegian_test() {
         check_vocabulary("no", Algorithm::Norwegian);
     }
 }
	//! This library provides Rust implementations of several stemming algorithms
	//! written in the [Snowball language](https://snowballstem.org/).
	//!
	//!
	//! All algorithms expect the input to already be lowercased.
	//!
	//! # Usage
	//! ```toml
	//! [dependencies]
	//! rust-stemmers = "^1.0"
	//! ```
	//!
	//! ```rust
	//! extern crate rust_stemmers;
	//!
	//! use rust_stemmers::{Algorithm, Stemmer};
	//!
	//! fn main() {
	//!    let en_stemmer = Stemmer::create(Algorithm::English);
	//!    assert_eq!(en_stemmer.stem("fruitlessly"), "fruitless");
	//! }
	//! ```
	extern crate serde;
	#[macro_use]
	extern crate serde_derive;

	use std::borrow::Cow;

	mod snowball;

	use snowball::SnowballEnv;
	use snowball::algorithms;

	/// Enum of all supported stemming algorithms, one variant per language.
	///
	/// Pass a variant to `Stemmer::create` to obtain the stemmer for that language.
	/// Check the [Snowball website](https://snowballstem.org/) for details on the
	/// individual algorithms.
	///
	/// The type is `Copy`, comparable with `==`, and derives serde's
	/// `Serialize`/`Deserialize`.
	#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
	pub enum Algorithm {
	Arabic,
	Danish,
	Dutch,
	English,
	Finnish,
	French,
	German,
	Greek,
	Hungarian,
	Italian,
	Norwegian,
	Portuguese,
	Romanian,
	Russian,
	Spanish,
	Swedish,
	Tamil,
	Turkish
	}

	/// Wraps a usable interface around the actual stemmer implementation.
	///
	/// Build one with `Stemmer::create` and call `Stemmer::stem` for each word.
	pub struct Stemmer {
	// Entry point of the selected language's algorithm. It operates on the
	// `SnowballEnv` it is handed; `stem` ignores the returned `bool`.
	stemmer: fn(&mut SnowballEnv) -> bool,
	}

	impl Stemmer {
	/// Create a new stemmer from an algorithm
	pub fn create(lang: Algorithm) -> Self {
	match lang {
	Algorithm::Arabic => Stemmer { stemmer: algorithms::arabic::stem },
	Algorithm::Danish => Stemmer { stemmer: algorithms::danish::stem },
	Algorithm::Dutch => Stemmer { stemmer: algorithms::dutch::stem },
	Algorithm::English => Stemmer { stemmer: algorithms::english::stem },
	Algorithm::Finnish => Stemmer { stemmer: algorithms::finnish::stem },
	Algorithm::French => Stemmer { stemmer: algorithms::french::stem },
	Algorithm::German => Stemmer { stemmer: algorithms::german::stem },
	Algorithm::Greek => Stemmer { stemmer: algorithms::greek::stem },
	Algorithm::Hungarian => Stemmer { stemmer: algorithms::hungarian::stem },
	Algorithm::Italian => Stemmer { stemmer: algorithms::italian::stem },
	Algorithm::Norwegian => Stemmer { stemmer: algorithms::norwegian::stem },
	Algorithm::Portuguese => Stemmer { stemmer: algorithms::portuguese::stem },
	Algorithm::Romanian => Stemmer { stemmer: algorithms::romanian::stem },
	Algorithm::Russian => Stemmer { stemmer: algorithms::russian::stem },
	Algorithm::Spanish => Stemmer { stemmer: algorithms::spanish::stem },
	Algorithm::Swedish => Stemmer { stemmer: algorithms::swedish::stem },
	Algorithm::Tamil => Stemmer { stemmer: algorithms::tamil::stem },
	Algorithm::Turkish => Stemmer { stemmer: algorithms::turkish::stem },
	}
	}

	/// Stem a single word
	/// Please note, that the input is expected to be all lowercase (if that is applicable).
	pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> {
	let mut env = SnowballEnv::create(input);
	(self.stemmer)(&mut env);
	env.get_current()
	}
	}



	#[cfg(test)]
	mod tests {
	    use super::{Stemmer, Algorithm};

	    /// Assert that `lhs` stems to `rhs` under the given algorithm.
	    ///
	    /// The failure message names the offending word and the algorithm so a
	    /// mismatch can be located in the vocabulary file.
	    fn stemms_to(lhs: &str, rhs: &str, stemmer: Algorithm) {
	        assert_eq!(Stemmer::create(stemmer).stem(lhs), rhs,
	                   "unexpected stem for {:?} using {:?}", lhs, stemmer);
	    }

	    /// Stem every line of `test_data/voc_<code>.txt` with `algorithm` and compare
	    /// it with the same line of `test_data/res_<code>.txt`.
	    ///
	    /// Panics if either file cannot be opened or read, or if any word stems to
	    /// something other than the expected result.
	    fn check_vocabulary(code: &str, algorithm: Algorithm) {
	        use std::fs;
	        use std::io;
	        use std::io::BufRead;

	        // Open one of the two fixture files, naming the path when that fails.
	        let open = |kind: &str| {
	            let path = format!("test_data/{}_{}.txt", kind, code);
	            let file = fs::File::open(&path)
	                .unwrap_or_else(|err| panic!("cannot open {}: {}", path, err));
	            io::BufReader::new(file)
	        };

	        let vocab = open("voc");
	        let result = open("res");

	        // NOTE: `zip` stops at the shorter input, so a difference in line
	        // count between the two files is not detected here.
	        for (voc, res) in vocab.lines().zip(result.lines()) {
	            stemms_to(voc.unwrap().as_str(),
	                      res.unwrap().as_str(),
	                      algorithm);
	        }
	    }

	    #[test]
	    fn german_test() {
	        check_vocabulary("ger", Algorithm::German);
	    }

	    #[test]
	    fn english_test() {
	        check_vocabulary("en", Algorithm::English);
	    }

	    #[test]
	    fn french_test() {
	        check_vocabulary("fr", Algorithm::French);
	    }

	    #[test]
	    fn spanish_test() {
	        check_vocabulary("es", Algorithm::Spanish);
	    }

	    #[test]
	    fn portuguese_test() {
	        check_vocabulary("pt", Algorithm::Portuguese);
	    }

	    #[test]
	    fn italian_test() {
	        check_vocabulary("it", Algorithm::Italian);
	    }

	    #[test]
	    fn romanian_test() {
	        check_vocabulary("ro", Algorithm::Romanian);
	    }

	    #[test]
	    fn russian_test() {
	        check_vocabulary("ru", Algorithm::Russian);
	    }

	    #[test]
	    fn arabic_test() {
	        check_vocabulary("ar", Algorithm::Arabic);
	    }

	    #[test]
	    fn finnish_test() {
	        check_vocabulary("fi", Algorithm::Finnish);
	    }

	    #[test]
	    fn greek_test() {
	        check_vocabulary("el", Algorithm::Greek);
	    }

	    #[test]
	    fn norwegian_test() {
	        check_vocabulary("no", Algorithm::Norwegian);
	    }
	}