pkgsrc-Changes archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

CVS commit: pkgsrc/textproc



Module Name:    pkgsrc
Committed By:   scole
Date:           Thu Aug 13 20:52:09 UTC 2020

Modified Files:
        pkgsrc/textproc: Makefile
Added Files:
        pkgsrc/textproc/split-thai: DESCR Makefile PLIST distinfo
        pkgsrc/textproc/split-thai/files: README.txt st-emacs st-icu.cc
            st-swath thai-utility.el thaidict.abm

Log Message:
Add split-thai 0.1, a set of utilities for splitting Thai UTF8 text by word boundaries


To generate a diff of this commit:
cvs rdiff -u -r1.1164 -r1.1165 pkgsrc/textproc/Makefile
cvs rdiff -u -r0 -r1.1 pkgsrc/textproc/split-thai/DESCR \
    pkgsrc/textproc/split-thai/Makefile pkgsrc/textproc/split-thai/PLIST \
    pkgsrc/textproc/split-thai/distinfo
cvs rdiff -u -r0 -r1.1 pkgsrc/textproc/split-thai/files/README.txt \
    pkgsrc/textproc/split-thai/files/st-emacs \
    pkgsrc/textproc/split-thai/files/st-icu.cc \
    pkgsrc/textproc/split-thai/files/st-swath \
    pkgsrc/textproc/split-thai/files/thai-utility.el \
    pkgsrc/textproc/split-thai/files/thaidict.abm

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: pkgsrc/textproc/Makefile
diff -u pkgsrc/textproc/Makefile:1.1164 pkgsrc/textproc/Makefile:1.1165
--- pkgsrc/textproc/Makefile:1.1164     Fri Aug  7 02:36:24 2020
+++ pkgsrc/textproc/Makefile    Thu Aug 13 20:52:08 2020
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.1164 2020/08/07 02:36:24 brook Exp $
+# $NetBSD: Makefile,v 1.1165 2020/08/13 20:52:08 scole Exp $
 #
 
 COMMENT=       Text processing utilities (does not include desktop publishing)
@@ -1099,6 +1099,7 @@ SUBDIR+=  soprano
 SUBDIR+=       sord
 SUBDIR+=       source-highlight
 SUBDIR+=       sphinxsearch
+SUBDIR+=       split-thai
 SUBDIR+=       stardic
 SUBDIR+=       sub2srt
 SUBDIR+=       sublib

Added files:

Index: pkgsrc/textproc/split-thai/DESCR
diff -u /dev/null pkgsrc/textproc/split-thai/DESCR:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/DESCR    Thu Aug 13 20:52:09 2020
@@ -0,0 +1,6 @@
+A collection of utilities to split Thai Unicode UTF-8 text by word
+boundaries, also known as word tokenization.  The utilities use emacs,
+swath, and a c++ icu-project program.  All use dictionary-based word
+splitting.
+
+Also included is a merged dictionary file of Thai words.
Index: pkgsrc/textproc/split-thai/Makefile
diff -u /dev/null pkgsrc/textproc/split-thai/Makefile:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/Makefile Thu Aug 13 20:52:09 2020
@@ -0,0 +1,81 @@
+# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+PKGNAME=       split-thai-0.1
+CATEGORIES=    textproc
+MAINTAINER=    pkgsrc-users%NetBSD.org@localhost
+COMMENT=       Utilities to split UTF-8 Thai text into words
+LICENSE=       public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
+
+# xxx fetching a specific version of a file out of a github project
+EXTRACT_SUFX=  # none
+GITHUB_ICU_TAG=        61607c27732906d36c5bd4d23ecc092f89f53a2b
+DISTFILES=     thaidict-${GITHUB_ICU_TAG}.txt
+MASTER_SITES=  -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt
+
+USE_LANGUAGES= c++11   # darwin needed 11?
+
+USE_TOOLS=     pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
+BUILD_DEPENDS+=        libdatrie-[0-9]*:../../devel/libdatrie
+DEPENDS+=      emacs-[0-9]*:../../editors/emacs
+DEPENDS+=      swath-[0-9]*:../../textproc/swath
+
+REPLACE_SH=    st-swath
+
+UTF8_ENV=      env LC_ALL=C.UTF-8
+
+ST_SHARE_DIR=          share/split-thai
+INSTALLATION_DIRS=     bin ${ST_SHARE_DIR}
+
+# xxx REPLACE_EMACS_SCRIPT
+SUBST_CLASSES+=                        st-emacs-app
+SUBST_STAGE.st-emacs-app=      pre-configure
+SUBST_MESSAGE.st-emacs-app=    Fixing emacs script paths.
+SUBST_FILES.st-emacs-app=      st-emacs
+SUBST_SED.st-emacs-app=                -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
+
+SUBST_CLASSES+=                        dictionary-app
+SUBST_STAGE.dictionary-app=    pre-configure
+SUBST_MESSAGE.dictionary-app=  Fixing dictionary paths.
+SUBST_FILES.dictionary-app=    st-emacs st-swath
+SUBST_SED.dictionary-app=      -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+
+pre-extract:
+       mkdir -p ${WRKSRC}
+       cd files && cp README.txt st-emacs st-icu.cc st-swath \
+               thai-utility.el thaidict.abm ${WRKSRC}
+
+post-extract:
+       cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
+               -f batch-byte-compile thai-utility.el
+       cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \
+               --eval '(thai-word-table-save "emacs-dict")'
+       cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
+       cd ${PREFIX}/share/swath && \
+               ${UTF8_ENV} trietool swathdic list | \
+               awk '{print $$1}' > ${WRKSRC}/swath-dict
+       cd ${WRKSRC} && \
+               ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
+                       grep -v '#' | sort | uniq > thaidict
+       cd ${WRKSRC} && \
+               ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
+.for i in emacs-dict icu-dict swath-dict
+       @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
+.endfor
+       @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
+               unique words in combined dictionary
+
+do-build:
+       cd ${WRKSRC} && \
+               ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \
+               `pkg-config --libs --cflags icu-io`
+
+do-install:
+       ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
+               ${DESTDIR}${PREFIX}/bin
+       ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
+.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri
+       ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
+.endfor
+
+.include "../../textproc/icu/buildlink3.mk"
+.include "../../mk/bsd.pkg.mk"
Index: pkgsrc/textproc/split-thai/PLIST
diff -u /dev/null pkgsrc/textproc/split-thai/PLIST:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/PLIST    Thu Aug 13 20:52:09 2020
@@ -0,0 +1,9 @@
+@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $
+bin/st-emacs
+bin/st-icu
+bin/st-swath
+share/split-thai/README.txt
+share/split-thai/thai-utility.el
+share/split-thai/thai-utility.elc
+share/split-thai/thaidict
+share/split-thai/thaidict.tri
Index: pkgsrc/textproc/split-thai/distinfo
diff -u /dev/null pkgsrc/textproc/split-thai/distinfo:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/distinfo Thu Aug 13 20:52:09 2020
@@ -0,0 +1,6 @@
+$NetBSD: distinfo,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+SHA1 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 2a2ad127cc279835cb4df04eb69401a0d4927774
+RMD160 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 0a6df7b7dd6ef502c5dd20020e37b2ca1a5514a2
+SHA512 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 88800fe2a453fc40f16ff54c21c852a8ea8e1496e42d5d187e5b5ac0ff58050830fc0816239e4f88cb23ed301f894d1ca52eb4676fd85c13c285cec815ae7c42
+Size (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 493044 bytes

Index: pkgsrc/textproc/split-thai/files/README.txt
diff -u /dev/null pkgsrc/textproc/split-thai/files/README.txt:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/README.txt Thu Aug 13 20:52:09 2020
@@ -0,0 +1,49 @@
+This is a collection of utilities to separate Thai words by spaces
+(word tokenization).  They can separate stdin, files, or text as
+arguments.  It includes 3 separate utilities:
+
+st-emacs:  emacs-script using emacs lisp thai-word library
+           https://www.gnu.org/software/emacs/
+st-icu:    basic C++ program using the ICU library
+           http://site.icu-project.org/
+st-swath:  sh script wrapper to simplify args to the swath program
+           https://linux.thai.net/projects/swath
+
+All scripts should be able to take a filename, stdin, or arguments as
+input, e.g., :
+
+      # st-swath แมวและหมา
+or
+      # echo "แมวและหมา" | st-swath
+or      
+      # st-swath < thaifile.txt
+or
+      # st-swath "แมวหมา" พ่อและแม่
+      
+You will most likely need to set LC_ALL or LC_CTYPE to an appropriate
+unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for
+them to work properly.  These tools are set up to only support UTF-8
+encodings.
+
+Note that it is not possible to split Thai words 100% accurately
+without context and meaning.  These programs use dictionary-based word
+splitting.
+
+Also included in the package is a combined thai word dictionary and
+corresponding .tri file, and emacs lisp .el file for reading and
+dumping out dictionary files.
+
+st-emacs and st-swath are set up to use the combined dictionary with
+words from the emacs 'thai-word library, swath dictionary words, and
+the icu thai library words.
+
+st-icu uses its own built in library.  To customise the icu
+dictionary, you apparently would have to modify
+  icu4c/source/data/brkitr/dictionaries/thaidict.txt
+and rebuild icu library, and then rebuild the whole thing.
+
+There is also 
+
+See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+
+TODO - fix st-icu to use all the combined dictionary words.
Index: pkgsrc/textproc/split-thai/files/st-emacs
diff -u /dev/null pkgsrc/textproc/split-thai/files/st-emacs:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-emacs   Thu Aug 13 20:52:09 2020
@@ -0,0 +1,54 @@
+#!/bin/emacs --script
+;;
+;; break thai string into words separated by spaces
+;;
+;; - if no args, process stdin
+;; - if one arg and file exists with arg name, process file
+;; - else join get remainder of args and process
+;;
+
+;;(toggle-debug-on-error) ;; debug
+(require 'thai-word)
+
+;; load custom dictionary
+(load "ST_SHARE_DIR/thai-utility" nil t)
+(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict")
+
+;; split a thai line by spaces, return new line
+(defun process-thai-line(line)
+  (with-temp-buffer
+    (insert line)
+    (goto-char (point-min))
+    (thai-break-words " ")
+    (buffer-string)))
+
+;; hack to process stdin
+(defun process-stdin()
+  (condition-case nil
+      (let (aline)
+       (while (setq aline (read-from-minibuffer ""))
+         (princ (process-thai-line aline))
+         (princ "\n")))
+    (error nil)))
+
+;; process arguments, remove "emacs -scriptload scriptname" from args,
+;; join the rest by spaces
+(setq args (cdddr command-line-args))
+(setq argc (length args))
+
+;; no args => process stdin
+(when (= 0 argc)
+  (process-stdin)
+  (kill-emacs 0))
+
+;; if one arg and arg is a file, process that file
+;; else process all input args joined by spaces with an added newline
+(with-temp-buffer
+  (if (and (= 1 argc) (file-exists-p (car args)))
+      (insert-file-contents (car args))
+    (insert (mapconcat 'identity (cdddr command-line-args) " "))
+    (insert "\n"))
+  (goto-char (point-min))
+  (thai-break-words " ")
+  (write-region nil nil "/dev/stdout"))
+(kill-emacs 0)
Index: pkgsrc/textproc/split-thai/files/st-icu.cc
diff -u /dev/null pkgsrc/textproc/split-thai/files/st-icu.cc:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-icu.cc  Thu Aug 13 20:52:09 2020
@@ -0,0 +1,195 @@
+/*
+ *   split up thai strings in a file, stdin or args into "words"
+ */
+#include <fstream>
+#include <vector>
+
+#include <unicode/brkiter.h>
+#include <unicode/regex.h>
+#include <unicode/ucnv.h>
+#include <unicode/ustream.h>
+#include <unicode/ustdio.h>
+
+using namespace std;
+using namespace icu;
+
+void usage() {
+ const char *progname = "st-icu";
+       
+ cout << endl <<
+  "Usage: " << progname << " [stdin|filename|thaiarg1 thaiarg2 ...]" <<
+    endl << endl <<
+     "This program attempts to split thai strings into thai words." << endl <<
+     "It takes a filename, stdin, or UTF8 thai string(s) as arguments" << endl <<
+     "and prints out the string separated by spaces." << endl <<
+     "When no argument is given, it can read lines from stdin, and" << endl <<
+     "separate thai words in the line by spaces." << endl << endl <<
+     "returns 0 on succes, or non-zero otherwise" << endl << endl;
+}
+
+// return true if string contains any thai unicode
+bool contains_thai(const UnicodeString &s) {
+       UErrorCode status = U_ZERO_ERROR;
+       // matches one or more thai chars, \u0e01-\u0e5b should work too
+       RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+       if (U_FAILURE(status)) {
+               // syntax errors in the regular expression
+               cerr << "error creating RegexMatcher" << endl;
+               exit(1);
+       }
+
+       matcher->reset(s);
+       if (matcher->find())
+               return true;
+       else
+               return false;
+}
+
+// split a unicode string by word boundaries.  if arg contains
+// whitespaces, it will get consolidated to single spaces.
+// if string has no thai characters, return it unmodified
+UnicodeString split_words_consolidated(const UnicodeString &s) {
+       if ( ! contains_thai(s) ) {
+               return s;
+       }
+       
+       UErrorCode status = U_ZERO_ERROR;
+       BreakIterator* wordBreaker =
+               BreakIterator::createWordInstance(Locale::getUS(), status);
+       if ( U_FAILURE(status) ) {
+               cerr << "error creating BreakIterator" << endl;
+               exit(1);
+       }
+
+       wordBreaker->setText(s);        
+       vector<int32_t> vbreak;
+
+       int32_t pos = wordBreaker->first();
+       while( pos != BreakIterator::DONE ) {
+               // cout << "boundary " << pos << endl;
+               vbreak.push_back(pos);
+               pos = wordBreaker->next();
+       }
+
+       // only one word found, trim and done
+       if ( vbreak.size() == 1 ) {
+               UnicodeString ss(s);
+               return ss.trim();
+       }
+       
+       UnicodeString rs;
+       for (int i = 0 ; i < vbreak.size() - 1; i++) {
+               UnicodeString ss;
+               s.extractBetween(vbreak[i], vbreak[i+1], ss);
+               ss.trim();
+               if ( ss != "" )
+                       rs += ss + " ";
+       }
+
+       return rs.trim();
+}
+
+// split a unicode string by word boundaries trying to preserve
+// original spacing
+UnicodeString split_words(const UnicodeString &s) {
+       UnicodeString tempStr;
+       UnicodeString rs;
+       for (int i = 0 ; i < s.length() ; i++) {
+               if ( ! u_isUWhiteSpace(s[i]) ) {
+                       tempStr += s[i];
+               } else {
+                       if ( tempStr.length() > 0 ) {
+                               rs += split_words_consolidated(tempStr);
+                               tempStr.remove();
+                       }
+                       rs += s[i];
+               }
+       }
+       if ( tempStr.length() > 0 )
+               rs += split_words_consolidated(tempStr);
+       return rs;
+}
+
+// split stdin
+void split_stdin() {
+       UFILE *in = u_finit(stdin, NULL, NULL);
+       if ( !in ) {
+               cerr << "error: u_finit of stdin failed" << endl;
+               exit(1);
+       }
+
+       UChar uch;
+       UnicodeString line;
+       while ( (uch = u_fgetc(in)) ) {
+               if ( uch == 0xffff ) {
+                       break;
+               } else if ( uch == '\n' ) {
+                       UnicodeString s(line);
+                       cout << split_words(s) << endl;
+                       line = "";
+               } else {
+                       line += uch;
+               }
+       }
+               
+       u_fclose(in);
+}
+
+// read file line by line, splitting each line 1 at a time
+void split_file(const char* filename) {
+       UFILE *in = u_fopen(filename, "r", NULL, NULL);
+       if ( !in ) {
+               cerr << "error: opening file " << filename << endl;
+               exit(1);
+       }
+       const int32_t maxLine = 1024;
+       UChar line[maxLine];
+       while ( u_fgets(line, maxLine, in) != NULL ) {
+               //cout << split_words(line) << endl;
+               cout << split_words(line);
+       }
+
+       u_fclose(in);
+}
+
+// check if file is "readable"
+bool is_readable(const char* fname) {
+    ifstream infile(fname);
+    return infile.good();
+}
+
+int main(int argc, char **argv) {
+       // utf8 for everything
+       ucnv_setDefaultName("UTF-8");
+
+       // read stdin when no args passed in
+       if ( argc <= 1 ) {
+               split_stdin();
+               exit(0);
+       }
+
+       // check second arg for help flag
+       UnicodeString arg2(argv[1]);
+       if ( arg2 == "-h" || arg2 == "-H" || arg2 == "-?" || arg2 == "-help" ) {
+               usage();
+               exit(0);
+       }
+
+       // if only one arg and exists with arg name, process file
+       if ( argc == 2 && is_readable(argv[1]) ) {
+               split_file(argv[1]);
+               exit(0);
+       }
+
+       // join remainder of args and process as string
+       UnicodeString inArgs;
+       for ( int i = 1 ; i < argc ; i++ ) {
+               UnicodeString s(argv[i]);
+               inArgs += s;
+               if ( i < (argc - 1) )
+                       inArgs += " ";
+       }
+       cout << split_words(inArgs) << endl;
+       exit(0);
+}
Index: pkgsrc/textproc/split-thai/files/st-swath
diff -u /dev/null pkgsrc/textproc/split-thai/files/st-swath:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-swath   Thu Aug 13 20:52:09 2020
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# simple wrapper for swath to split thai text from stdin, arg, or a
+# file
+#
+# swath settings are split with ' ', longest match, unicode input, and
+# unicode output.  see swath(1)
+#
+
+# use merged dictionary unless specified otherwise
+if [ -z "$SWATHDICT" ]; then
+    dictarg="-d ST_SHARE_DIR/thaidict.tri"
+fi
+
+if [ "$#" -eq 0 ]; then
+    # no args, read from stdin
+    while read line
+    do
+       echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg
+    done < /dev/stdin
+    exit 0 
+elif [ "$#" -eq 1 -a -e "$1" ]; then
+    # one arg and arg is an existing file
+    swath -b ' ' -m long -u 'u,u' $dictarg < "$1"
+    exit $?
+elif [ "$#" -ge 1 ]; then
+    # one or more args, assume it is all text
+    while [ "$1" != "" ]; do
+       if [ -z "$txt" ]; then
+           txt="$1"
+       else
+           txt="$txt $1"
+       fi
+
+       shift
+    done
+    echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg
+    exit $?
+else
+    echo "$0: error parsing args"
+    exit 1
+fi
Index: pkgsrc/textproc/split-thai/files/thai-utility.el
diff -u /dev/null pkgsrc/textproc/split-thai/files/thai-utility.el:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/thai-utility.el    Thu Aug 13 20:52:09 2020
@@ -0,0 +1,97 @@
+(require 'mule-util)
+(require 'thai-word)
+
+" nested-alist from mule-util looks like this:          "
+"  '(3585 1                       ;; ก      word   ก    "
+"    (3591 1                      ;;  ง     word   กง   "
+"          (3585 t                ;;    ก               "
+"                (3634 t          ;;     า              "
+"                      (3619 1))));;      ร word   กงการ"
+"    (3585 1                      ;;  ก     word   กก   "
+"          (3621 1))))            ;;   ล    word   กกล  "
+
+(defun extract-thai-na(nlist thaistr)
+  "helper function to reconstruct thai words from a nested alist,
+uses recursion"
+  (let ((ucode)
+       (complete))
+    (cond
+     ;; finished
+     ((not nlist) nil)
+
+     ;; (3591 1 ...
+     ((integerp (car nlist))
+      ;; xxx care about coding-system vars here?
+      (setq ucode (char-to-string (car nlist)))
+      (setq complete (cadr nlist))
+      (setq thaistr (concat thaistr ucode))
+      (cond
+       ;; t => no word at this depth 
+       ((equal complete t)
+       (extract-thai-na (cddr nlist) thaistr))
+       ;; 1 => word at this depth
+       ((equal complete 1)
+       (append (list thaistr)
+               (extract-thai-na (cddr nlist) thaistr) '()))
+       (t
+       (error "invalid parsing for complete var"))))
+     
+     ;; not finished
+     (t
+      (append (extract-thai-na (car nlist) thaistr)
+             (extract-thai-na (cdr nlist) thaistr) '())))))
+
+(defun thai-word-table-save(filename &optional alist)
+  "save thai words extracted from a nested-alist table to
+filename in utf8 format.  default is to save 'thai-word-table if
+no alist argument given."
+  (interactive)
+  (let ((thaiwords)
+       (elem)
+       (coding-system-for-read 'utf-8)
+       (coding-system-for-write 'utf-8)
+       (buffer-file-coding-system 'utf-8))
+    ;; default list or not
+    (setq alist (or alist
+                   thai-word-table))
+
+    (or (nested-alist-p alist)
+      (error "Invalid argument %s" alist))
+
+    ;; remove 'thai-words from 'thai-word-table
+    (setq alist (cdr alist))
+
+    (with-temp-buffer
+      ;; process per-letter list one at a time.  could process whole
+      ;; list at once but maybe try to conserve memory resources
+      (while (setq elem (car alist))
+       (setq alist (cdr alist))
+       (setq thaiwords (extract-thai-na elem ""))
+       
+       (dolist (elem thaiwords)
+         (insert elem "\n")))
+
+      (sort-lines nil (point-min) (point-max))
+      (write-region nil nil filename)
+      (buffer-string))))
+
+;; 'thai-tis620 is default for emacs <= 28
+(defun thai-update-word-table-utf8 (file &optional append)
+  "Update Thai word table by replacing the current word list with
+FILE, which is in utf-8.  If called with a prefix argument, FILE
+is appended instead to the current word list.  Does the same as
+'thai-update-word-table, except that function expects
+'thai-tis620 encoding"
+  (interactive "FThai word table file: \nP")
+  (let* ((coding-system-for-read 'utf-8)
+        (coding-system-for-write 'utf-8)
+        (buffer-file-coding-system 'utf-8)
+        (temp_file (make-temp-file "thaiutf8_")))
+    (unwind-protect
+       (with-temp-buffer
+         (insert-file-contents file)
+         (setq coding-system-for-write 'thai-tis620)
+         (write-file temp_file))
+      (thai-update-word-table temp_file append)
+      (delete-file temp_file)
+      thai-word-table)))
Index: pkgsrc/textproc/split-thai/files/thaidict.abm
diff -u /dev/null pkgsrc/textproc/split-thai/files/thaidict.abm:1.1
--- /dev/null   Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/thaidict.abm       Thu Aug 13 20:52:09 2020
@@ -0,0 +1,2 @@
+[0x002d,0x002e]
+[0x0e01,0x0e5b]



Home | Main Index | Thread Index | Old Index