Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[pkgsrc/trunk]: pkgsrc/textproc Add split-thai 0.1, a set of utilities for sp...
details: https://anonhg.NetBSD.org/pkgsrc/rev/aef0e41b614a
branches: trunk
changeset: 436982:aef0e41b614a
user: scole <scole%pkgsrc.org@localhost>
date: Thu Aug 13 20:52:08 2020 +0000
description:
Add split-thai 0.1, a set of utilities for splitting Thai UTF8 text by word boundaries
diffstat:
textproc/Makefile | 3 +-
textproc/split-thai/DESCR | 6 +
textproc/split-thai/Makefile | 81 ++++++++++++
textproc/split-thai/PLIST | 9 +
textproc/split-thai/distinfo | 6 +
textproc/split-thai/files/README.txt | 49 +++++++
textproc/split-thai/files/st-emacs | 54 ++++++++
textproc/split-thai/files/st-icu.cc | 195 ++++++++++++++++++++++++++++++
textproc/split-thai/files/st-swath | 42 ++++++
textproc/split-thai/files/thai-utility.el | 97 ++++++++++++++
textproc/split-thai/files/thaidict.abm | 2 +
11 files changed, 543 insertions(+), 1 deletions(-)
diffs (truncated from 598 to 300 lines):
diff -r 7bf95d495d4c -r aef0e41b614a textproc/Makefile
--- a/textproc/Makefile Thu Aug 13 18:30:51 2020 +0000
+++ b/textproc/Makefile Thu Aug 13 20:52:08 2020 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.1164 2020/08/07 02:36:24 brook Exp $
+# $NetBSD: Makefile,v 1.1165 2020/08/13 20:52:08 scole Exp $
#
COMMENT= Text processing utilities (does not include desktop publishing)
@@ -1099,6 +1099,7 @@
SUBDIR+= sord
SUBDIR+= source-highlight
SUBDIR+= sphinxsearch
+SUBDIR+= split-thai
SUBDIR+= stardic
SUBDIR+= sub2srt
SUBDIR+= sublib
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/DESCR
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/DESCR Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,6 @@
+A collection of utilities to split Thai Unicode UTF-8 text by word
+boundaries, also known as word tokenization. The utilities use emacs,
+swath, and a c++ icu-project program. All use dictionary-based word
+splitting.
+
+Also included is a merged dictionary file of Thai words.
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/Makefile Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,81 @@
+# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+PKGNAME= split-thai-0.1
+CATEGORIES= textproc
+MAINTAINER= pkgsrc-users%NetBSD.org@localhost
+COMMENT= Utilities to split UTF-8 Thai text into words
+LICENSE= public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
+
+# xxx fetching a specific version of a file out of a github project
+# The only distfile is thaidict.txt as of commit GITHUB_ICU_TAG in the
+# unicode-org/icu repository; the local file name embeds that tag.
+# NOTE(review): the leading "-" on MASTER_SITES presumably tells pkgsrc to
+# fetch exactly that URL and store it under the DISTFILES name -- confirm.
+EXTRACT_SUFX= # none
+GITHUB_ICU_TAG= 61607c27732906d36c5bd4d23ecc092f89f53a2b
+DISTFILES= thaidict-${GITHUB_ICU_TAG}.txt
+MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt
+
+USE_LANGUAGES= c++11 # darwin needed 11?
+
+USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
+BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie
+DEPENDS+= emacs-[0-9]*:../../editors/emacs
+DEPENDS+= swath-[0-9]*:../../textproc/swath
+
+REPLACE_SH= st-swath
+
+UTF8_ENV= env LC_ALL=C.UTF-8
+
+ST_SHARE_DIR= share/split-thai
+INSTALLATION_DIRS= bin ${ST_SHARE_DIR}
+
+# xxx REPLACE_EMACS_SCRIPT
+# Rewrite the "!/bin/emacs" interpreter path in st-emacs so that it points
+# at emacs under PREFIX.
+SUBST_CLASSES+= st-emacs-app
+SUBST_STAGE.st-emacs-app= pre-configure
+SUBST_MESSAGE.st-emacs-app= Fixing emacs script paths.
+SUBST_FILES.st-emacs-app= st-emacs
+SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
+
+# Replace the literal ST_SHARE_DIR placeholder in st-emacs and st-swath
+# with the absolute share directory under PREFIX.
+SUBST_CLASSES+= dictionary-app
+SUBST_STAGE.dictionary-app= pre-configure
+SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
+SUBST_FILES.dictionary-app= st-emacs st-swath
+SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+
+# Create WRKSRC and copy the package-local sources from files/ into it,
+# so the later targets can work on them there.
+pre-extract:
+ mkdir -p ${WRKSRC}
+ cd files && cp README.txt st-emacs st-icu.cc st-swath \
+ thai-utility.el thaidict.abm ${WRKSRC}
+
+# Build the combined dictionary inside WRKSRC, running the tools under
+# UTF8_ENV:
+#  1. byte-compile thai-utility.el, then load it and call
+#     thai-word-table-save (presumably defined there) to write "emacs-dict"
+#  2. copy the fetched ICU distfile to "icu-dict"
+#  3. list swathdic from ${PREFIX}/share/swath with trietool and keep the
+#     first column of each line as "swath-dict"
+#  4. concatenate the three lists, drop every line containing '#', and
+#     sort/uniq the rest into "thaidict"
+#  5. feed "thaidict" to "trietool thaidict add-list" (presumably this
+#     produces the thaidict.tri that do-install installs -- confirm)
+# Afterwards print the line count of each source list and of the merged
+# list.
+# NOTE(review): emacs and trietool are invoked by bare name, so they are
+# resolved through PATH at build time.
+post-extract:
+ cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
+ -f batch-byte-compile thai-utility.el
+ cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \
+ --eval '(thai-word-table-save "emacs-dict")'
+ cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
+ cd ${PREFIX}/share/swath && \
+ ${UTF8_ENV} trietool swathdic list | \
+ awk '{print $$1}' > ${WRKSRC}/swath-dict
+ cd ${WRKSRC} && \
+ ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
+ grep -v '#' | sort | uniq > thaidict
+ cd ${WRKSRC} && \
+ ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
+.for i in emacs-dict icu-dict swath-dict
+ @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
+.endfor
+ @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
+ unique words in combined dictionary
+
+# Compile st-icu.cc into the st-icu binary inside WRKSRC; the ICU compile
+# and link flags come from pkg-config's icu-io module.
+# NOTE(review): only CPPFLAGS is passed to the compiler here; CXXFLAGS and
+# LDFLAGS are not.
+do-build:
+ cd ${WRKSRC} && \
+ ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \
+ `pkg-config --libs --cflags icu-io`
+
+# Install the two scripts and the compiled st-icu into bin, and the
+# README, dictionaries and elisp files into the share directory.
+# The share path is spelled via ST_SHARE_DIR so it always matches the
+# directory that INSTALLATION_DIRS creates.
+do-install:
+ ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
+ ${DESTDIR}${PREFIX}/bin
+ ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
+.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri
+ ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/${ST_SHARE_DIR}
+.endfor
+
+.include "../../textproc/icu/buildlink3.mk"
+.include "../../mk/bsd.pkg.mk"
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/PLIST
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/PLIST Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,9 @@
+@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $
+bin/st-emacs
+bin/st-icu
+bin/st-swath
+share/split-thai/README.txt
+share/split-thai/thai-utility.el
+share/split-thai/thai-utility.elc
+share/split-thai/thaidict
+share/split-thai/thaidict.tri
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/distinfo
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/distinfo Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,6 @@
+$NetBSD: distinfo,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+SHA1 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 2a2ad127cc279835cb4df04eb69401a0d4927774
+RMD160 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 0a6df7b7dd6ef502c5dd20020e37b2ca1a5514a2
+SHA512 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 88800fe2a453fc40f16ff54c21c852a8ea8e1496e42d5d187e5b5ac0ff58050830fc0816239e4f88cb23ed301f894d1ca52eb4676fd85c13c285cec815ae7c42
+Size (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 493044 bytes
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/files/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/README.txt Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,49 @@
+This is a collection of utilities to separate Thai words by spaces
+(word tokenization). They can separate stdin, files, or text as
+arguments. It includes 3 separate utilities:
+
+st-emacs: emacs-script using emacs lisp thai-word library
+ https://www.gnu.org/software/emacs/
+st-icu: basic C++ program using the ICU library
+ http://site.icu-project.org/
+st-swath: sh script wrapper to simplify args to the swath program
+ https://linux.thai.net/projects/swath
+
+All scripts should be able to take a filename, stdin, or arguments as
+input, e.g., :
+
+ # st-swath แมวและหมา
+or
+ # echo "แมวและหมา" | st-swath
+or
+ # st-swath < thaifile.txt
+or
+ # st-swath "แมวหมา" พ่อและแม่
+
+You will most likely need to set LC_ALL or LC_CTYPE to an appropriate
+unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for
+them to work properly. These tools are set up to only support UTF-8
+encodings.
+
+Note that it is not possible to split Thai words 100% accurately
+without context and meaning. These programs use dictionary-based word
+splitting.
+
+Also included in the package is a combined thai word dictionary and
+corresponding .tri file, and emacs lisp .el file for reading and
+dumping out dictionary files.
+
+st-emacs and st-swath are set up to use the combined dictionary with
+words from the emacs 'thai-word library, swath dictionary words, and
+the icu thai library words.
+
+st-icu uses its own built in library. To customise the icu
+dictionary, you apparently would have to modify
+ icu4c/source/data/brkitr/dictionaries/thaidict.txt
+and rebuild icu library, and then rebuild the whole thing.
+
+There is also
+
+See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+
+TODO - fix st-icu to use all the combined dictionary words.
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/files/st-emacs
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/st-emacs Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,54 @@
+#!/bin/emacs --script
+;;
+;; break thai string into words separated by spaces
+;;
+;; - if no args, process stdin
+;; - if one arg and file exists with arg name, process file
+;; - else join all of the args with spaces and process the result
+;;
+
+;;(toggle-debug-on-error) ;; debug
+(require 'thai-word)
+
+;; load custom dictionary
+(load "ST_SHARE_DIR/thai-utility" nil t)
+(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict")
+
+;; insert spaces at the thai word boundaries of one line of text
+(defun process-thai-line (line)
+  "Return the text of LINE after `thai-break-words' has split it with spaces."
+  (with-temp-buffer
+    (insert line)
+    ;; rewind so that all of the inserted text lies ahead of point
+    (goto-char (point-min))
+    (thai-break-words " ")
+    (buffer-substring (point-min) (point-max))))
+
+;; hack to process stdin: keep reading lines until the read signals an
+;; error, then stop quietly
+(defun process-stdin ()
+  "Read lines with `read-from-minibuffer' and print each one word-split.
+Any error signalled inside the loop ends it silently."
+  (ignore-errors
+    (let (aline)
+      (while (setq aline (read-from-minibuffer ""))
+        (princ (process-thai-line aline))
+        (princ "\n")))))
+
+;; collect the user-supplied arguments: the first three entries of
+;; command-line-args ("emacs -scriptload scriptname") are dropped
+(setq args (nthcdr 3 command-line-args))
+(setq argc (length args))
+
+;; nothing on the command line => word-split stdin and exit
+(when (zerop argc)
+  (process-stdin)
+  (kill-emacs 0))
+
+;; a single argument naming an existing file means "process that file";
+;; anything else is treated as text: all args joined by spaces, plus a
+;; trailing newline
+(with-temp-buffer
+  (if (and (= argc 1) (file-exists-p (car args)))
+      (insert-file-contents (car args))
+    (insert (mapconcat #'identity args " ") "\n"))
+  (goto-char (point-min))
+  (thai-break-words " ")
+  (write-region nil nil "/dev/stdout"))
+(kill-emacs 0)
diff -r 7bf95d495d4c -r aef0e41b614a textproc/split-thai/files/st-icu.cc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/st-icu.cc Thu Aug 13 20:52:08 2020 +0000
@@ -0,0 +1,195 @@
+/*
+ * split up thai strings in a file, stdin or args into "words"
+ */
+#include <fstream>
+#include <vector>
+
+#include <unicode/brkiter.h>
+#include <unicode/regex.h>
+#include <unicode/ucnv.h>
+#include <unicode/ustream.h>
+#include <unicode/ustdio.h>
+
+using namespace std;
+using namespace icu;
+
+// Print the st-icu usage/help text to stdout.
+void usage() {
+    const char *progname = "st-icu";
+
+    cout << endl <<
+        "Usage: " << progname << " [stdin|filename|thaiarg1 thaiarg2 ...]" <<
+        endl << endl <<
+        "This program attempts to split thai strings into thai words." << endl <<
+        "It takes a filename, stdin, or UTF8 thai string(s) as arguments" << endl <<
+        "and prints out the string separated by spaces." << endl <<
+        "When no argument is given, it can read lines from stdin, and" << endl <<
+        "separate thai words in the line by spaces." << endl << endl <<
+        "returns 0 on success, or non-zero otherwise" << endl << endl;
+}
+
+// Return true if the string contains at least one character in the
+// range U+0E00..U+0E7F (thai), false otherwise.
+//
+// If the RegexMatcher cannot be created, an error is written to stderr
+// and the process exits with status 1.
+bool contains_thai(const UnicodeString &s) {
+    UErrorCode status = U_ZERO_ERROR;
+    // matches one or more thai chars, \u0e01-\u0e5b should work too.
+    // The matcher is an automatic object so that it is destroyed on
+    // every return path instead of having to be deleted by hand.
+    RegexMatcher matcher("[\u0e00-\u0e7f]+", 0, status);
+
+    if (U_FAILURE(status)) {
+        // syntax errors in the regular expression
+        cerr << "error creating RegexMatcher" << endl;
+        exit(1);
+    }
+
+    matcher.reset(s);
+    return matcher.find();
+}
+
+// split a unicode string by word boundaries. if arg contains
+// whitespaces, it will get consolidated to single spaces.
Home |
Main Index |
Thread Index |
Old Index