CVS commit: pkgsrc/textproc/split-thai

To: pkgsrc-changes%NetBSD.org@localhost
Subject: CVS commit: pkgsrc/textproc/split-thai
From: "Sean Cole" <scole%netbsd.org@localhost>
Date: Mon, 17 Aug 2020 17:43:15 +0000

Module Name:    pkgsrc
Committed By:   scole
Date:           Mon Aug 17 17:43:15 UTC 2020

Modified Files:
        pkgsrc/textproc/split-thai: Makefile
        pkgsrc/textproc/split-thai/files: README.txt st-emacs st-icu.cc
            st-swath thai-utility.el

Log Message:
Update to 0.4
- always use pkgsrc path for swath for st-swath script
- make splitting of numbers a little more consistent for st-emacs & st-icu
- add split-thai, split-thai-line, wrapper functions to emacs lisp code


To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 pkgsrc/textproc/split-thai/Makefile
cvs rdiff -u -r1.2 -r1.3 pkgsrc/textproc/split-thai/files/README.txt
cvs rdiff -u -r1.3 -r1.4 pkgsrc/textproc/split-thai/files/st-emacs \
    pkgsrc/textproc/split-thai/files/thai-utility.el
cvs rdiff -u -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/st-icu.cc \
    pkgsrc/textproc/split-thai/files/st-swath

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: pkgsrc/textproc/split-thai/Makefile
diff -u pkgsrc/textproc/split-thai/Makefile:1.3 pkgsrc/textproc/split-thai/Makefile:1.4
--- pkgsrc/textproc/split-thai/Makefile:1.3     Sat Aug 15 16:52:28 2020
+++ pkgsrc/textproc/split-thai/Makefile Mon Aug 17 17:43:15 2020
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.3 2020/08/15 16:52:28 scole Exp $
+# $NetBSD: Makefile,v 1.4 2020/08/17 17:43:15 scole Exp $
 
-PKGNAME=       split-thai-0.3
+PKGNAME=       split-thai-0.4
 CATEGORIES=    textproc
 MAINTAINER=    pkgsrc-users%NetBSD.org@localhost
 COMMENT=       Utilities to split UTF-8 Thai text into words
@@ -24,7 +24,8 @@ REPLACE_SH=   st-swath
 UTF8_ENV=      env LC_ALL=C.UTF-8
 
 ST_SHARE_DIR=          share/split-thai
-INSTALLATION_DIRS=     bin ${ST_SHARE_DIR}
+ST_SHARE_BIN=          bin
+INSTALLATION_DIRS=     ${ST_SHARE_BIN} ${ST_SHARE_DIR}
 
 ST_SHARE_FILES=                README.txt thaidict thai-dict.el thai-dict.elc
 ST_SHARE_FILES+=       thai-utility.el thai-utility.elc thaidict.tri
@@ -41,6 +42,7 @@ SUBST_STAGE.dictionary-app=   pre-configur
 SUBST_MESSAGE.dictionary-app=  Fixing dictionary paths.
 SUBST_FILES.dictionary-app=    st-emacs st-swath
 SUBST_SED.dictionary-app=      -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+SUBST_SED.dictionary-app+=     -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
 
 pre-extract:
        mkdir -p ${WRKSRC}

Index: pkgsrc/textproc/split-thai/files/README.txt
diff -u pkgsrc/textproc/split-thai/files/README.txt:1.2 pkgsrc/textproc/split-thai/files/README.txt:1.3
--- pkgsrc/textproc/split-thai/files/README.txt:1.2     Fri Aug 14 17:31:34 2020
+++ pkgsrc/textproc/split-thai/files/README.txt Mon Aug 17 17:43:15 2020
@@ -66,5 +66,5 @@ SEE ALSO
 
 BUGS
      st-icu should also use the combined dictionary words.
-     st-emacs and st-icu don't always split thai numbers well.
+     thai text mixed with other languages may not be handled well.
      this file should be converted to a proper manpage.

Index: pkgsrc/textproc/split-thai/files/st-emacs
diff -u pkgsrc/textproc/split-thai/files/st-emacs:1.3 pkgsrc/textproc/split-thai/files/st-emacs:1.4
--- pkgsrc/textproc/split-thai/files/st-emacs:1.3       Sat Aug 15 16:52:29 2020
+++ pkgsrc/textproc/split-thai/files/st-emacs   Mon Aug 17 17:43:15 2020
@@ -18,7 +18,7 @@
   (with-temp-buffer
     (insert line)
     (goto-char (point-min))
-    (thai-break-words " ")
+    (split-thai-line)
     (buffer-string)))
 
 ;; hack to process stdin
@@ -48,6 +48,6 @@
     (insert (mapconcat 'identity (cdddr command-line-args) " "))
     (insert "\n"))
   (goto-char (point-min))
-  (thai-break-words " ")
+  (split-thai)
   (write-region nil nil "/dev/stdout"))
 (kill-emacs 0)
Index: pkgsrc/textproc/split-thai/files/thai-utility.el
diff -u pkgsrc/textproc/split-thai/files/thai-utility.el:1.3 pkgsrc/textproc/split-thai/files/thai-utility.el:1.4
--- pkgsrc/textproc/split-thai/files/thai-utility.el:1.3        Sat Aug 15 16:52:29 2020
+++ pkgsrc/textproc/split-thai/files/thai-utility.el    Mon Aug 17 17:43:15 2020
@@ -168,15 +168,38 @@ dictionary words."
       (write-region nil nil lispfile))
     line_count))
 
-(defun split-thai-line(&optional separator)
+(defun split-thai-line()
   "Break Thai words from point to end of line by inserting a
 separator string at word boundaries. (wrapper for 'thai-break-words)"
   (interactive)
-    (thai-break-words (or separator " ") (line-end-position)))
+  (thai-break-words " " (line-end-position))
+  (split-thai-numbers (point) (line-end-position)))
 
-(defun split-thai(&optional separator)
+(defun split-thai()
   "Break Thai words from point to end of buffer by inserting a
 separator string at word boundaries. (wrapper for
 'thai-break-words)"
   (interactive)
-    (thai-break-words (or separator " ") (point-max)))
+  (thai-break-words " " (point-max))
+  (split-thai-numbers (point) (point-max)))
+
+(defun split-thai-numbers(start_point end_point)
+  "helper function to separate numbers in a buffer.
+'thai-break-words doesn't always split numbers properly. this may
+improve tokenization somewhat."
+  ;; xxx this really should be fixed in 'thai-word lib
+  (let* (
+        ;; not used, would also match ascii digits: "\\([๐๑๒๓๔๕๖๗๘๙0123456789]+\\)"
+        (num_rexp "\\([\u0e50-\u0e59]+\\)") ;; one or more thai digits
+        (nonnum_rexp "\\([\u0e00-\u0e4f\u0e5a-\u0e7f]\\)") ;; one thai block char that is not a digit
+        (trailing_rexp (concat num_rexp nonnum_rexp)) ;; digits followed by a non-digit
+        (leading_rexp (concat nonnum_rexp num_rexp))) ;; non-digit followed by digits
+    (save-restriction ;; the narrowing below is undone on exit
+      (narrow-to-region start_point end_point)
+      (goto-char (point-min)) ;; pass 1: put a space after each run of digits
+      (while (search-forward-regexp trailing_rexp nil t)
+       (replace-match (concat (match-string 1) " " (match-string 2))))
+      (goto-char (point-min)) ;; pass 2: put a space before each run of digits
+      (while (search-forward-regexp leading_rexp nil t)
+       (replace-match (concat (match-string 1) " " (match-string 2))))
+      (goto-char start_point)))) ;; leave point at the start of the region

Index: pkgsrc/textproc/split-thai/files/st-icu.cc
diff -u pkgsrc/textproc/split-thai/files/st-icu.cc:1.1 pkgsrc/textproc/split-thai/files/st-icu.cc:1.2
--- pkgsrc/textproc/split-thai/files/st-icu.cc:1.1      Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-icu.cc  Mon Aug 17 17:43:15 2020
@@ -13,6 +13,13 @@
 using namespace std;
 using namespace icu;
 
+// regular expressions over code point ranges in the unicode thai block
+// 0x0e01 - 0x0e5b should work for thai_rexp as well...
+const UnicodeString thai_rexp = "[\\u0e00-\\u0e7f]+";
+const UnicodeString thai_consonant = "[\\u0e01-\\u0e2e]+";
+const UnicodeString thai_num_rexp = "[\\u0e50-\\u0e59]+";
+const UnicodeString thai_nonnum_rexp = "[\\u0e01-\\u0e4f\\u0e5a-\\u0e7f]+";
+
 void usage() {
  const char *progname = "st-icu";
        
@@ -27,11 +34,11 @@ void usage() {
      "returns 0 on succes, or non-zero otherwise" << endl << endl;
 }
 
-// return true if string contains any thai unicode
-bool contains_thai(const UnicodeString &s) {
+// return true if string contains some regexp
+bool matches_regexp(const UnicodeString &s, const UnicodeString &regexp) {
        UErrorCode status = U_ZERO_ERROR;
-       // matches one or more thai chars, \u0e01-\u0e5b should work too
-       RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+       RegexMatcher *matcher = new RegexMatcher(regexp, 0, status);
 
        if (U_FAILURE(status)) {
                // syntax errors in the regular expression
@@ -46,11 +53,36 @@ bool contains_thai(const UnicodeString &
                return false;
 }
 
+// return a copy of s with a space inserted between a digit and a thai char
+UnicodeString space_thai_numbers(const UnicodeString &s) {
+       // return string unmodified if no numbers
+       if ( ! matches_regexp(s, thai_num_rexp) ) {
+               return s;
+       }
+
+       UnicodeString rs;
+       UChar32 pch = 0; // previous char; start at U+0000 (not a digit, not thai)
+       // add a space where a digit meets a thai non-digit, in either order
+       for (int i = 0 ; i < s.length(); i++) {
+               if ( u_isWhitespace(s[i]) ) {
+                       rs += s[i];
+               } else if ((u_isdigit(s[i]) && !u_isdigit(pch) && matches_regexp(pch, thai_rexp)) ||
+                          (u_isdigit(pch) && !u_isdigit(s[i]) && matches_regexp(s[i], thai_rexp))) {
+                       rs += " ";
+                       rs += s[i];
+               } else {
+                       rs += s[i];
+               }
+               pch = s[i];
+       }
+       return rs;
+}
+
 // split a unicode string by word boundaries.  if arg contains
 // whitespaces, it will get consolidated to single spaces.
 // if string has no thai characters, return it unmodified
 UnicodeString split_words_consolidated(const UnicodeString &s) {
-       if ( ! contains_thai(s) ) {
+       if ( ! matches_regexp(s, thai_rexp) ) {
                return s;
        }
        
@@ -108,6 +140,8 @@ UnicodeString split_words(const UnicodeS
        }
        if ( tempStr.length() > 0 )
                rs += split_words_consolidated(tempStr);
+
+       rs = space_thai_numbers(rs);
        return rs;
 }
 
Index: pkgsrc/textproc/split-thai/files/st-swath
diff -u pkgsrc/textproc/split-thai/files/st-swath:1.1 pkgsrc/textproc/split-thai/files/st-swath:1.2
--- pkgsrc/textproc/split-thai/files/st-swath:1.1       Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-swath   Mon Aug 17 17:43:15 2020
@@ -6,6 +6,7 @@
 # swath settings are split with ' ', longest match, unicode input, and
 # unicode output.  see swath(1)
 #
+swath_cmd=ST_SHARE_BIN/swath
 
 # use merged dictionary unless specified otherwise
 if [ -z "$SWATHDICT" ]; then
@@ -16,12 +17,12 @@ if [ "$#" -eq 0 ]; then
     # no args, read from stdin
     while read line
     do
-       echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg
+       echo "$line" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
     done < /dev/stdin
     exit 0 
 elif [ "$#" -eq 1 -a -e "$1" ]; then
     # one arg and arg is an existing file
-    swath -b ' ' -m long -u 'u,u' $dictarg < "$1"
+    $swath_cmd -b ' ' -m long -u 'u,u' $dictarg < "$1"
     exit $?
 elif [ "$#" -ge 1 ]; then
     # one or more args, assume it is all text
@@ -34,7 +35,7 @@ elif [ "$#" -ge 1 ]; then
 
        shift
     done
-    echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg
+    echo "$txt" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
     exit $?
 else
     echo "$0: error parsing args"

Prev by Date: CVS commit: pkgsrc/fonts/fontconfig
Next by Date: CVS commit: pkgsrc/doc
Previous by Thread: CVS commit: pkgsrc/fonts/fontconfig
Next by Thread: CVS commit: pkgsrc/doc
Indexes:

Home | Main Index | Thread Index | Old Index