Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[pkgsrc/trunk]: pkgsrc/textproc/split-thai Update to 0.9



details:   https://anonhg.NetBSD.org/pkgsrc/rev/4b27b379a1cf
branches:  trunk
changeset: 438151:4b27b379a1cf
user:      scole <scole%pkgsrc.org@localhost>
date:      Sat Sep 05 18:02:36 2020 +0000

description:
Update to 0.9
- add a perl script st-wordbreak

diffstat:

 textproc/split-thai/DESCR              |    6 +-
 textproc/split-thai/Makefile           |   13 +-
 textproc/split-thai/PLIST              |    3 +-
 textproc/split-thai/files/README.txt   |   26 +-
 textproc/split-thai/files/st-wordbreak |  284 +++++++++++++++++++++++++++++++++
 5 files changed, 310 insertions(+), 22 deletions(-)

diffs (truncated from 421 to 300 lines):

diff -r 26c21f62e229 -r 4b27b379a1cf textproc/split-thai/DESCR
--- a/textproc/split-thai/DESCR Sat Sep 05 12:28:09 2020 +0000
+++ b/textproc/split-thai/DESCR Sat Sep 05 18:02:36 2020 +0000
@@ -1,7 +1,7 @@
 A collection of utilities to split Thai Unicode UTF-8 text by word
-boundaries, also known as word tokenization.  The utilities use emacs,
-swath, and a c++ icu-project program.  All use dictionary-based word
-splitting.
+boundaries, also known as word tokenization or word breaking.  The
+utilities use emacs, swath, perl, and a c++ icu-project program.  All
+use dictionary-based word splitting.
 
 Also included is a merged dictionary file of Thai words and a perl
 script to grep Thai UTF-8 words.
diff -r 26c21f62e229 -r 4b27b379a1cf textproc/split-thai/Makefile
--- a/textproc/split-thai/Makefile      Sat Sep 05 12:28:09 2020 +0000
+++ b/textproc/split-thai/Makefile      Sat Sep 05 18:02:36 2020 +0000
@@ -1,7 +1,6 @@
-# $NetBSD: Makefile,v 1.9 2020/08/31 18:12:18 wiz Exp $
+# $NetBSD: Makefile,v 1.10 2020/09/05 18:02:36 scole Exp $
 
-PKGNAME=       split-thai-0.8
-PKGREVISION=   1
+PKGNAME=       split-thai-0.9
 CATEGORIES=    textproc
 MAINTAINER=    pkgsrc-users%NetBSD.org@localhost
 COMMENT=       Utilities to split UTF-8 Thai text into words
@@ -21,7 +20,7 @@
 DEPENDS+=      emacs-[0-9]*:../../editors/emacs
 DEPENDS+=      swath-[0-9]*:../../textproc/swath
 
-REPLACE_PERL=  tgrep
+REPLACE_PERL=  st-wordbreak tgrep
 REPLACE_SH=    st-swath
 
 UTF8_ENV=      env LC_ALL=C.UTF-8
@@ -43,14 +42,14 @@
 SUBST_CLASSES+=                        dictionary-app
 SUBST_STAGE.dictionary-app=    pre-configure
 SUBST_MESSAGE.dictionary-app=  Fixing dictionary paths.
-SUBST_FILES.dictionary-app=    st-emacs st-swath
+SUBST_FILES.dictionary-app=    st-emacs st-swath st-wordbreak
 SUBST_SED.dictionary-app=      -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
 SUBST_SED.dictionary-app+=     -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
 
 pre-extract:
        mkdir -p ${WRKSRC}
        cd files && cp README.txt st-emacs st-icu.cc st-swath \
-               tgrep thai-utility.el thaidict.abm ${WRKSRC}
+               st-wordbreak tgrep thai-utility.el thaidict.abm ${WRKSRC}
 
 post-extract:
        cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
@@ -83,7 +82,7 @@
 
 do-install:
        ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
-               ${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
+               ${WRKSRC}/st-wordbreak ${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
        ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
 .for i in ${ST_SHARE_FILES}
        ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
diff -r 26c21f62e229 -r 4b27b379a1cf textproc/split-thai/PLIST
--- a/textproc/split-thai/PLIST Sat Sep 05 12:28:09 2020 +0000
+++ b/textproc/split-thai/PLIST Sat Sep 05 18:02:36 2020 +0000
@@ -1,7 +1,8 @@
-@comment $NetBSD: PLIST,v 1.3 2020/08/28 16:02:42 scole Exp $
+@comment $NetBSD: PLIST,v 1.4 2020/09/05 18:02:36 scole Exp $
 bin/st-emacs
 bin/st-icu
 bin/st-swath
+bin/st-wordbreak
 bin/tgrep
 share/split-thai/README.txt
 share/split-thai/thai-dict.el
diff -r 26c21f62e229 -r 4b27b379a1cf textproc/split-thai/files/README.txt
--- a/textproc/split-thai/files/README.txt      Sat Sep 05 12:28:09 2020 +0000
+++ b/textproc/split-thai/files/README.txt      Sat Sep 05 18:02:36 2020 +0000
@@ -2,10 +2,11 @@
      st-emacs
      st-icu
      st-swath
+     st-wordbreak
      tgrep
 
 SYNOPSIS
-     st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
+     st-emacs|st-icu|st-swath|st-wordbreak [filename|text1 text2 ...|'blank']
      tgrep [options] FILE ...
 
 DESCRIPTION
@@ -13,14 +14,16 @@
      by spaces (word tokenization).  They can separate stdin, files,
      or text as arguments.  It includes these utilities:
 
-     st-emacs:  emacs-script using emacs lisp thai-word library
-                https://www.gnu.org/software/emacs/
-     st-icu:    basic C++ program using the ICU library
-                http://site.icu-project.org/
-     st-swath:  sh script wrapper to simplfy args to the swath program
-                https://linux.thai.net/projects/swath
+     st-emacs:     emacs-script using emacs lisp thai-word library
+                   https://www.gnu.org/software/emacs/
+     st-icu:       basic C++ program using the ICU library
+                   http://site.icu-project.org/
+     st-swath:     sh script wrapper to simplfy args to the swath program
+                   https://linux.thai.net/projects/swath
+     st-wordbreak: perl script to brute-force separate thai words,
+                   see "st-wordbreak -h"
 
-     tgrep:     grep-like utility using perl, see "tgrep -h"
+     tgrep:        grep-like utility using perl, see "tgrep -h"
 
 EXAMPLES
       split one or more text strings:
@@ -62,9 +65,9 @@
      and corresponding .tri file, and emacs lisp .el files for reading
      and dumping out dictionary files.
 
-     st-emacs and st-swath are setup to use the combined dictionary
-     with words from the emacs 'thai-word library, swath dictionary
-     words, and the icu thai library words.
+     st-emacs, st-swath, and st-wordbreak are setup to use the
+     combined dictionary with words from the emacs 'thai-word library,
+     swath dictionary words, and the icu thai library words.
 
      st-icu uses its own built in library.  To customise the icu
      dictionary, you apparently would have to modify
@@ -79,3 +82,4 @@
      thai text mixed with other languages may not be handled well when
      splitting.
      this file should be converted to proper manpages.
+     these utilities need better names.
diff -r 26c21f62e229 -r 4b27b379a1cf textproc/split-thai/files/st-wordbreak
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/st-wordbreak    Sat Sep 05 18:02:36 2020 +0000
@@ -0,0 +1,284 @@
+#!/bin/perl
+#
+# split thai text or file into longest (or smallest) possible words
+# based on words in a dictionary file. 
+#
+use strict;
+use warnings;
+use Encode;
+use Getopt::Std;
+
+use utf8;
+use open qw/:std :utf8/;
+
+our ($opt_a, $opt_d, $opt_h, $opt_l, $opt_s);
+
+# global dictionary of thai words
+my %tdict = ();
+
+# tmp results for recursions
+my @tmp_combos = ();
+
+#
+# read thai dictionary into hash.  each line in dictionary should be a
+# thai word. returns number of words read.
+#
+sub read_dictionary {
+    # default dictionary
+    my $fname = "ST_SHARE_DIR/thaidict";
+    $fname = $opt_d if defined $opt_d;
+
+    die "error reading dictionary $fname" unless -f $fname && -s $fname;
+    open(my $fh, '<', $fname) or die "error opening $fname: $!";
+    while ( <$fh> ) {
+       chomp;
+       $tdict{$_} = 1;
+    }
+    close $fh or die "error closing $fname : $!";
+
+    # add punctuation and misc so parser won't choke
+    my @misc_words =
+       ( "~", "`", "!", "@", "#", "\$", "%", "^", "&", "*", "(", ")",
+         "-", "_", "=", "+", "\\", "|", "{", "}", "[", "]",
+         ";", ":", "'", "\"", "<", ">", ".", ",", "/", "?",
+         "ๆ", "ฯาฯ", "ฯ", "฿", "๏", "๚", "๛" );
+
+    foreach ( @misc_words ) {
+       $tdict{$_} = 1;
+    }
+    
+    my $size = keys %tdict;
+    return $size;
+}
+
+#
+# find all possible word breaks for a string,
+# returns an array of strings
+#
+sub word_break {
+    my $str = shift @_;
+
+    # return string as is unless it contains thai
+    return ( $str ) unless $str =~ m/\p{InThai}/;
+    
+    # add numbers to dictionary entries so numbers will be parsed
+    my @numbers = ( $str =~ m/([0123456789๑๒๓๔๕๖๗๘๙๐,.]+)/g );
+    foreach ( @numbers ) {
+       $tdict{$_} = 1;
+    }
+
+    # add any english looking words 
+    my @eng_words = ( $str =~ m/([a-zA-Z]+)/g );
+    foreach ( @eng_words ) {
+       $tdict{$_} = 1;
+    }
+
+    # clear any whitespace
+    $str =~ s/\s+//g;
+    
+    # clear any previous word breaks
+    @tmp_combos = ();
+    word_break_util( $str, length($str), "");
+
+    # filter results depending on args
+    my @filtered = filter_combos( @tmp_combos );
+    return @filtered;
+}
+
+#
+# recursive function to find all possible word combos of string based
+# on words in dictionary.  adapted from
+# https://www.geeksforgeeks.org/word-break-problem-using-backtracking
+#
+# saves combos to global list
+#
+sub word_break_util{
+    my $str = shift @_;
+    my $n = shift @_;
+    my $result = shift @_;
+
+    for ( my $i=1; $i <= $n; $i++ ) {
+       my $prefix = substr($str, 0, $i);
+
+       if ( exists $tdict{$prefix} ) {
+           if ( $i == $n ) {
+               $result .= $prefix;
+               #print $result, "\n";
+               push @tmp_combos, $result;
+               return;
+           }
+           word_break_util( substr($str, $i, $n-$i),
+                            $n-$i,$result . $prefix . " ");
+       }
+    }
+}
+
+#
+# filter results depending on input args
+#
+sub filter_combos {
+    my @swords = @_;
+    
+    # do nothing if keeping all or no words given
+    if ( ! @swords ||
+        $opt_a && ( ! $opt_l && ! $opt_s ) ) {
+       return @swords;
+    }
+
+    # get count of words for each str, make hash str => word-count
+    my %combos = map { $_ => scalar( split / /, $_ ) } @swords;
+
+    # sort by least or most words
+    my @scombos = ();
+    foreach my $str ( sort
+                     { $opt_s ? $combos{$b} <=> $combos{$a} :
+                           $combos{$a} <=> $combos{$b} } keys %combos ) {
+       push @scombos, $str;
+    }
+
+    # only one permutation found, not listing more than one permutation,
+    # or only one "best" match
+    if ( scalar ( @scombos ) < 2 ||
+        ! $opt_a ||
+        $combos{$scombos[0]} != $combos{$scombos[1]} ) {
+       return ( $scombos[0] );
+    }
+
+    # more than one best match and want them listed, so return "best" matches
+    my $mark = $combos{$scombos[0]};
+    my @best_set = ();
+    foreach ( @scombos ) {
+       if ( $combos{$_} == $mark ) {
+           push @best_set, $_;
+       } else {
+           last;
+       }
+    }
+    die "could not find best set" unless @best_set;
+    
+    return @best_set;
+}
+
+#
+# word break a file line-by-line, print to stdout,


Home | Main Index | Thread Index | Old Index