Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[pkgsrc/trunk]: pkgsrc/textproc Add split-thai 0.1, a set of utilities for sp...



details:   https://anonhg.NetBSD.org/pkgsrc/rev/aef0e41b614a
branches:  trunk
changeset: 436982:aef0e41b614a
user:      scole <scole%pkgsrc.org@localhost>
date:      Thu Aug 13 20:52:08 2020 +0000

description:
Add split-thai 0.1, a set of utilities for splitting Thai UTF8 text by word boundaries

diffstat:

 textproc/Makefile                         |    3 +-
 textproc/split-thai/DESCR                 |    6 +
 textproc/split-thai/Makefile              |   81 ++++++++++++
 textproc/split-thai/PLIST                 |    9 +
 textproc/split-thai/distinfo              |    6 +
 textproc/split-thai/files/README.txt      |   49 +++++++
 textproc/split-thai/files/st-emacs        |   54 ++++++++
 textproc/split-thai/files/st-icu.cc       |  195 ++++++++++++++++++++++++++++++
 textproc/split-thai/files/st-swath        |   42 ++++++
 textproc/split-thai/files/thai-utility.el |   97 ++++++++++++++
 textproc/split-thai/files/thaidict.abm    |    2 +
 11 files changed, 543 insertions(+), 1 deletions(-)

diffs (truncated from 598 to 300 lines):

diff -r 7bf95d495d4c -r aef0e41b614a textproc/Makefile
--- a/textproc/Makefile Thu Aug 13 18:30:51 2020 +0000
+++ b/textproc/Makefile Thu Aug 13 20:52:08 2020 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.1164 2020/08/07 02:36:24 brook Exp $
+# $NetBSD: Makefile,v 1.1165 2020/08/13 20:52:08 scole Exp $
 #
 
 COMMENT=       Text processing utilities (does not include desktop publishing)
@@ -1099,6 +1099,7 @@
 SUBDIR+=       sord
 SUBDIR+=       source-highlight
 SUBDIR+=       sphinxsearch
+SUBDIR+=       split-thai
 SUBDIR+=       stardic
 SUBDIR+=       sub2srt
 SUBDIR+=       sublib
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/DESCR
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/DESCR Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,6 @@
+A collection of utilities to split Thai Unicode UTF-8 text by word
+boundaries, also known as word tokenization.  The utilities use emacs,
+swath, and a C++ icu-project program.  All use dictionary-based word
+splitting.
+
+Also included is a merged dictionary file of Thai words.
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/Makefile      Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,81 @@
+# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+PKGNAME=       split-thai-0.1
+CATEGORIES=    textproc
+MAINTAINER=    pkgsrc-users%NetBSD.org@localhost
+COMMENT=       Utilities to split UTF-8 Thai text into words
+LICENSE=       public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
+
+# xxx fetching a specific version of a file out of a github project
+EXTRACT_SUFX=  # none
+GITHUB_ICU_TAG=        61607c27732906d36c5bd4d23ecc092f89f53a2b
+DISTFILES=     thaidict-${GITHUB_ICU_TAG}.txt
+MASTER_SITES=  -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt
+
+USE_LANGUAGES= c++11   # darwin needed 11?
+
+USE_TOOLS=     pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
+BUILD_DEPENDS+=        libdatrie-[0-9]*:../../devel/libdatrie
+DEPENDS+=      emacs-[0-9]*:../../editors/emacs
+DEPENDS+=      swath-[0-9]*:../../textproc/swath
+
+REPLACE_SH=    st-swath
+
+UTF8_ENV=      env LC_ALL=C.UTF-8
+
+ST_SHARE_DIR=          share/split-thai
+INSTALLATION_DIRS=     bin ${ST_SHARE_DIR}
+
+# xxx REPLACE_EMACS_SCRIPT
+SUBST_CLASSES+=                        st-emacs-app
+SUBST_STAGE.st-emacs-app=      pre-configure
+SUBST_MESSAGE.st-emacs-app=    Fixing emacs script paths.
+SUBST_FILES.st-emacs-app=      st-emacs
+SUBST_SED.st-emacs-app=                -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
+
+SUBST_CLASSES+=                        dictionary-app
+SUBST_STAGE.dictionary-app=    pre-configure
+SUBST_MESSAGE.dictionary-app=  Fixing dictionary paths.
+SUBST_FILES.dictionary-app=    st-emacs st-swath
+SUBST_SED.dictionary-app=      -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+
+pre-extract:
+       mkdir -p ${WRKSRC}
+       cd files && cp README.txt st-emacs st-icu.cc st-swath \
+               thai-utility.el thaidict.abm ${WRKSRC}
+
+post-extract:
+       cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
+               -f batch-byte-compile thai-utility.el
+       cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \
+               --eval '(thai-word-table-save "emacs-dict")'
+       cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
+       cd ${PREFIX}/share/swath && \
+               ${UTF8_ENV} trietool swathdic list | \
+               awk '{print $$1}' > ${WRKSRC}/swath-dict
+       cd ${WRKSRC} && \
+               ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
+                       grep -v '#' | sort | uniq > thaidict
+       cd ${WRKSRC} && \
+               ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
+.for i in emacs-dict icu-dict swath-dict
+       @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
+.endfor
+       @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
+               unique words in combined dictionary
+
+do-build:
+       cd ${WRKSRC} && \
+               ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \
+               `pkg-config --libs --cflags icu-io`
+
+do-install:
+       ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
+               ${DESTDIR}${PREFIX}/bin
+       ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
+.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri
+       ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
+.endfor
+
+.include "../../textproc/icu/buildlink3.mk"
+.include "../../mk/bsd.pkg.mk"
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/PLIST
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/PLIST Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,9 @@
+@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $
+bin/st-emacs
+bin/st-icu
+bin/st-swath
+share/split-thai/README.txt
+share/split-thai/thai-utility.el
+share/split-thai/thai-utility.elc
+share/split-thai/thaidict
+share/split-thai/thaidict.tri
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/distinfo
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/distinfo      Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,6 @@
+$NetBSD: distinfo,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+SHA1 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 2a2ad127cc279835cb4df04eb69401a0d4927774
+RMD160 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 0a6df7b7dd6ef502c5dd20020e37b2ca1a5514a2
+SHA512 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 88800fe2a453fc40f16ff54c21c852a8ea8e1496e42d5d187e5b5ac0ff58050830fc0816239e4f88cb23ed301f894d1ca52eb4676fd85c13c285cec815ae7c42
+Size (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 493044 bytes
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/files/README.txt
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/README.txt      Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,49 @@
+This is a collection of utilities to separate Thai words by spaces
+(word tokenization).  They can separate stdin, files, or text as
+arguments.  It includes 3 separate utilities:
+
+st-emacs:  emacs-script using emacs lisp thai-word library
+           https://www.gnu.org/software/emacs/
+st-icu:    basic C++ program using the ICU library
+           http://site.icu-project.org/
+st-swath:  sh script wrapper to simplify args to the swath program
+           https://linux.thai.net/projects/swath
+
+All scripts should be able to take a filename, stdin, or arguments as
+input, e.g., :
+
+      # st-swath แมวและหมา
+or
+      # echo "แมวและหมา" | st-swath
+or      
+      # st-swath < thaifile.txt
+or
+      # st-swath "แมวหมา" พ่อและแม่
+      
+You will most likely need to set LC_ALL or LC_CTYPE to an appropriate
+unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for
+them to work properly.  These tools are setup to only support UTF-8
+encodings.
+
+Note that it is not possible to split Thai words 100% accurately
+without context and meaning.  These programs use dictionary-based word
+splitting.
+
+Also included in the package is a combined thai word dictionary and
+corresponding .tri file, and emacs lisp .el file for reading and
+dumping out dictionary files.
+
+st-emacs and st-swath are setup to use the combined dictionary with
+words from the emacs 'thai-word library, swath dictionary words, and
+the icu thai library words.
+
+st-icu uses its own built in library.  To customise the icu
+dictionary, you apparently would have to modify
+  icu4c/source/data/brkitr/dictionaries/thaidict.txt
+and rebuild icu library, and then rebuild the whole thing.
+
+There is also 
+
+See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+
+TODO - fix st-icu to use all the combined dictionary words.
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/files/st-emacs
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/st-emacs        Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,54 @@
+#!/bin/emacs --script
+;;
+;; break thai string into words separated by spaces
+;;
+;; - if no args, process stdin
+;; - if one arg and file exists with arg name, process file
+;; - else join get remainder of args and process
+;;
+
+;;(toggle-debug-on-error) ;; debug
+(require 'thai-word)
+
+;; load custom dictionary
+(load "ST_SHARE_DIR/thai-utility" nil t)
+(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict")
+
+;; split a thai line by spaces, return new line
+(defun process-thai-line(line)
+  (with-temp-buffer
+    (insert line)
+    (goto-char (point-min))
+    (thai-break-words " ")
+    (buffer-string)))
+
+;; hack to process stdin
+(defun process-stdin()
+  (condition-case nil
+      (let (aline)
+       (while (setq aline (read-from-minibuffer ""))
+         (princ (process-thai-line aline))
+         (princ "\n")))
+    (error nil)))
+
+;; process arguments, remove "emacs -scriptload scriptname" from args,
+;; join the rest by spaces
+(setq args (cdddr command-line-args))
+(setq argc (length args))
+
+;; no args => process stdin
+(when (= 0 argc)
+  (process-stdin)
+  (kill-emacs 0))
+
+;; if one arg and arg is a file, process that file
+;; else process all input args joined by spaces with an added newline
+(with-temp-buffer
+  (if (and (= 1 argc) (file-exists-p (car args)))
+      (insert-file-contents (car args))
+    (insert (mapconcat 'identity (cdddr command-line-args) " "))
+    (insert "\n"))
+  (goto-char (point-min))
+  (thai-break-words " ")
+  (write-region nil nil "/dev/stdout"))
+(kill-emacs 0)
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/files/st-icu.cc
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/st-icu.cc       Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,195 @@
+/*
+ *   split up thai strings in a file, stdin or args into "words"
+ */
+#include <fstream>
+#include <vector>
+
+#include <unicode/brkiter.h>
+#include <unicode/regex.h>
+#include <unicode/ucnv.h>
+#include <unicode/ustream.h>
+#include <unicode/ustdio.h>
+
+using namespace std;
+using namespace icu;
+
+void usage() {
+ const char *progname = "st-icu";
+       
+ cout << endl <<
+  "Usage: " << progname << " [stdin|filename|thaiarg1 thaiarg2 ...]" <<
+    endl << endl <<
+     "This program attempts to split thai strings into thai words." << endl <<
+     "It takes a filename, stdin, or UTF8 thai string(s) as arguments" << endl <<
+     "and prints out the string separated by spaces." << endl <<
+     "When no argument is given, it can read lines from stdin, and" << endl <<
+     "separate thai words in the line by spaces." << endl << endl <<
+     "returns 0 on succes, or non-zero otherwise" << endl << endl;
+}
+
+// return true if string contains any thai unicode
+bool contains_thai(const UnicodeString &s) {
+       UErrorCode status = U_ZERO_ERROR;
+       // matches one or more thai chars, \u0e01-\u0e5b should work too
+       RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+       if (U_FAILURE(status)) {
+               // syntax errors in the regular expression
+               cerr << "error creating RegexMatcher" << endl;
+               exit(1);
+       }
+
+       matcher->reset(s);
+       if (matcher->find())
+               return true;
+       else
+               return false;
+}
+
+// split a unicode string by word boundaries.  if arg contains
+// whitespaces, it will get consolidated to single spaces.


Home | Main Index | Thread Index | Old Index