#!/bin/sh

# Stop at the first command that exits with a non-zero status.
set -o errexit

# Directory containing this script. The regex helper scripts invoked by the
# generator functions are looked up relative to it.
D=$(dirname "$0")
| |
# Abort the script unless the command named by $1 can be found.
#
# Arguments: $1 - name of the command to look up with `command -v`.
# Outputs:   a "DEPENDENCY MISSING" message on stderr when the lookup fails.
# Returns:   0 when the command exists; otherwise exits the shell with 1.
#
# $1 is used directly instead of being copied into a named variable: plain
# sh has no portable `local`, so an assignment here would overwrite a global
# of the same name (main uses `cmd` as its loop variable).
requires() {
    if ! command -v "$1" > /dev/null 2>&1; then
        printf 'DEPENDENCY MISSING: %s must be installed\n' "$1" >&2
        exit 1
    fi
}
| |
# Report whether a value occurs in a list of words.
#
# Arguments: $1 - the value to search for; $2... - the candidate words.
# Returns:   0 if any candidate is string-equal to $1, 1 otherwise
#            (including when no candidates are given).
array_exists() {
    needle="$1"
    shift

    # With no `in` clause, `for` walks the remaining positional parameters.
    for el; do
        [ "$el" != "$needle" ] || return 0
    done
    return 1
}
| |
# Generate the forward and reverse grapheme break DFAs.
#
# The regex is whatever $D/regex/grapheme.sh prints; it is passed to
# `ucd-generate dfa` twice, once as GRAPHEME_BREAK_FWD and once, with
# --reverse --longest added, as GRAPHEME_BREAK_REV. Both runs are given
# src/unicode/fsm/ as their directory argument.
graphemes() {
    regex=$(sh "$D/regex/grapheme.sh")

    # Flags common to both invocations. Positional parameters are scoped to
    # this function call, so resetting them does not affect the caller.
    set -- --sparse --minimize --anchored --state-size 2

    printf '%s\n' "generating forward grapheme DFA"
    ucd-generate dfa --name GRAPHEME_BREAK_FWD \
        "$@" src/unicode/fsm/ "$regex"

    printf '%s\n' "generating reverse grapheme DFA"
    ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest \
        "$@" src/unicode/fsm/ "$regex"
}
| |
# Generate the forward word break DFA.
#
# The regex is whatever $D/regex/word.sh prints; it is passed to
# `ucd-generate dfa` under the name WORD_BREAK_FWD together with
# src/unicode/fsm/ as the directory argument.
words() {
    regex=$(sh "$D/regex/word.sh")

    printf '%s\n' "generating forward word DFA (this can take a while)"
    ucd-generate dfa --name WORD_BREAK_FWD \
        --sparse --minimize --anchored \
        --state-size 4 \
        src/unicode/fsm/ "$regex"
}
| |
# Generate the forward sentence break DFA.
#
# The regex is whatever $D/regex/sentence.sh prints; it is passed to
# `ucd-generate dfa` under the name SENTENCE_BREAK_FWD together with
# src/unicode/fsm/ as the directory argument.
sentences() {
    regex=$(sh "$D/regex/sentence.sh")

    printf '%s\n' "generating forward sentence DFA (this can take a while)"
    ucd-generate dfa --name SENTENCE_BREAK_FWD \
        --minimize --sparse --anchored \
        --state-size 4 \
        src/unicode/fsm/ "$regex"
}
| |
# Generate the reverse regional indicator DFA (REGIONAL_INDICATOR_REV).
#
# This DFA finds all occurrences of regional indicators. The reverse
# grapheme iterator and the reverse word iterator use it to treat regional
# indicators as a special case.
regional_indicator() {
    printf '%s\n' "generating regional indicator DFA"
    ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse \
        --classes --minimize --anchored --premultiply \
        --state-size 1 \
        src/unicode/fsm/ '\p{gcb=Regional_Indicator}'
}
| |
# Generate the forward simple word DFA (SIMPLE_WORD_FWD) from the regex \w.
#
# Unlike the other generators in this file, no --anchored flag is passed.
simple_word() {
    printf '%s\n' "generating forward simple word DFA"
    ucd-generate dfa --name SIMPLE_WORD_FWD \
        --sparse --minimize \
        --state-size 2 \
        src/unicode/fsm/ '\w'
}
| |
# Generate the anchored forward and reverse whitespace DFAs from the
# regex \s+ (WHITESPACE_ANCHORED_FWD and WHITESPACE_ANCHORED_REV).
#
# The two invocations share every flag except --reverse and --state-size:
# the forward DFA is built with --state-size 1, the reverse with 2.
whitespace() {
    # Flags common to both invocations. Positional parameters are scoped to
    # this function call, so resetting them does not affect the caller.
    set -- --anchored --classes --premultiply --minimize

    printf '%s\n' "generating forward whitespace DFA"
    ucd-generate dfa --name WHITESPACE_ANCHORED_FWD \
        "$@" --state-size 1 \
        src/unicode/fsm/ '\s+'

    printf '%s\n' "generating reverse whitespace DFA"
    ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse \
        "$@" --state-size 2 \
        src/unicode/fsm/ '\s+'
}
| |
# Parse the command line and run the requested generator commands.
#
# Arguments: any mix of command names from the list below, the word "all",
#            and the flags -h / --help / --list-commands. The flags are
#            recognized anywhere in the argument list. With no arguments,
#            or when "all" is present, every command in the list is run.
# Outputs:   the usage line and "unrecognized command" diagnostics go to
#            stderr; --list-commands prints one command per line on stdout.
# Returns:   0 when every requested command was recognized, 1 when at least
#            one was not. Recognized commands are still run in that case.
#            The help and list flags exit the shell directly.
main() {
    if array_exists "-h" "$@" || array_exists "--help" "$@"; then
        echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
        exit
    fi

    commands="
        graphemes
        sentences
        words
        regional-indicator
        simple-word
        whitespace
    "
    if array_exists "--list-commands" "$@"; then
        # Deliberately unquoted: split the list into one word per command.
        for cmd in $commands; do
            echo "$cmd"
        done
        exit
    fi

    # ucd-generate is used to compile regexes into DFAs.
    requires ucd-generate

    mkdir -p src/unicode/fsm/

    if [ $# -eq 0 ] || array_exists "all" "$@"; then
        # Deliberately unquoted: replace the arguments with every command.
        set -- $commands
    fi

    # Iterate over "$@" so each argument is taken verbatim; an argument
    # such as '*' is reported as unrecognized instead of being expanded
    # against the files in the current directory.
    status=0
    for cmd in "$@"; do
        if array_exists "$cmd" $commands; then
            # Command names use '-', the implementing functions use '_'.
            fun="$(echo "$cmd" | sed 's/-/_/g')"
            # $cmd was matched against the fixed list above, so the
            # function can be called by name without eval.
            "$fun"
        else
            echo "unrecognized command: $cmd" >&2
            status=1
        fi
    done
    return "$status"
}
| |
# Script entry point: hand the command-line arguments to main unchanged.
main "$@"