pkgsrc-Changes archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
CVS commit: pkgsrc/textproc/split-thai
Module Name: pkgsrc
Committed By: scole
Date: Mon Aug 17 17:43:15 UTC 2020
Modified Files:
pkgsrc/textproc/split-thai: Makefile
pkgsrc/textproc/split-thai/files: README.txt st-emacs st-icu.cc
st-swath thai-utility.el
Log Message:
Update to 0.4
- always use pkgsrc path for swath for st-swath script
- make splitting of numbers a little more consistent for st-emacs & st-icu
- add split-thai, split-thai-line, wrapper functions to emacs lisp code
To generate a diff of this commit:
cvs rdiff -u -r1.3 -r1.4 pkgsrc/textproc/split-thai/Makefile
cvs rdiff -u -r1.2 -r1.3 pkgsrc/textproc/split-thai/files/README.txt
cvs rdiff -u -r1.3 -r1.4 pkgsrc/textproc/split-thai/files/st-emacs \
pkgsrc/textproc/split-thai/files/thai-utility.el
cvs rdiff -u -r1.1 -r1.2 pkgsrc/textproc/split-thai/files/st-icu.cc \
pkgsrc/textproc/split-thai/files/st-swath
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: pkgsrc/textproc/split-thai/Makefile
diff -u pkgsrc/textproc/split-thai/Makefile:1.3 pkgsrc/textproc/split-thai/Makefile:1.4
--- pkgsrc/textproc/split-thai/Makefile:1.3 Sat Aug 15 16:52:28 2020
+++ pkgsrc/textproc/split-thai/Makefile Mon Aug 17 17:43:15 2020
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.3 2020/08/15 16:52:28 scole Exp $
+# $NetBSD: Makefile,v 1.4 2020/08/17 17:43:15 scole Exp $
-PKGNAME= split-thai-0.3
+PKGNAME= split-thai-0.4
CATEGORIES= textproc
MAINTAINER= pkgsrc-users%NetBSD.org@localhost
COMMENT= Utilities to split UTF-8 Thai text into words
@@ -24,7 +24,8 @@ REPLACE_SH= st-swath
UTF8_ENV= env LC_ALL=C.UTF-8
ST_SHARE_DIR= share/split-thai
-INSTALLATION_DIRS= bin ${ST_SHARE_DIR}
+ST_SHARE_BIN= bin
+INSTALLATION_DIRS= ${ST_SHARE_BIN} ${ST_SHARE_DIR}
ST_SHARE_FILES= README.txt thaidict thai-dict.el thai-dict.elc
ST_SHARE_FILES+= thai-utility.el thai-utility.elc thaidict.tri
@@ -41,6 +42,7 @@ SUBST_STAGE.dictionary-app= pre-configur
SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
SUBST_FILES.dictionary-app= st-emacs st-swath
SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
pre-extract:
mkdir -p ${WRKSRC}
Index: pkgsrc/textproc/split-thai/files/README.txt
diff -u pkgsrc/textproc/split-thai/files/README.txt:1.2 pkgsrc/textproc/split-thai/files/README.txt:1.3
--- pkgsrc/textproc/split-thai/files/README.txt:1.2 Fri Aug 14 17:31:34 2020
+++ pkgsrc/textproc/split-thai/files/README.txt Mon Aug 17 17:43:15 2020
@@ -66,5 +66,5 @@ SEE ALSO
BUGS
st-icu should also use the combined dictionary words.
- st-emacs and st-icu don't always split thai numbers well.
+ thai text mixed with other languages may not be handled well.
this file should be converted to a proper manpage.
Index: pkgsrc/textproc/split-thai/files/st-emacs
diff -u pkgsrc/textproc/split-thai/files/st-emacs:1.3 pkgsrc/textproc/split-thai/files/st-emacs:1.4
--- pkgsrc/textproc/split-thai/files/st-emacs:1.3 Sat Aug 15 16:52:29 2020
+++ pkgsrc/textproc/split-thai/files/st-emacs Mon Aug 17 17:43:15 2020
@@ -18,7 +18,7 @@
(with-temp-buffer
(insert line)
(goto-char (point-min))
- (thai-break-words " ")
+ (split-thai-line)
(buffer-string)))
;; hack to process stdin
@@ -48,6 +48,6 @@
(insert (mapconcat 'identity (cdddr command-line-args) " "))
(insert "\n"))
(goto-char (point-min))
- (thai-break-words " ")
+ (split-thai)
(write-region nil nil "/dev/stdout"))
(kill-emacs 0)
Index: pkgsrc/textproc/split-thai/files/thai-utility.el
diff -u pkgsrc/textproc/split-thai/files/thai-utility.el:1.3 pkgsrc/textproc/split-thai/files/thai-utility.el:1.4
--- pkgsrc/textproc/split-thai/files/thai-utility.el:1.3 Sat Aug 15 16:52:29 2020
+++ pkgsrc/textproc/split-thai/files/thai-utility.el Mon Aug 17 17:43:15 2020
@@ -168,15 +168,38 @@ dictionary words."
(write-region nil nil lispfile))
line_count))
-(defun split-thai-line(&optional separator)
+(defun split-thai-line()
"Break Thai words from point to end of line by inserting a
separator string at word boundaries. (wrapper for 'thai-break-words)"
(interactive)
- (thai-break-words (or separator " ") (line-end-position)))
+ (thai-break-words " " (line-end-position))
+ (split-thai-numbers (point) (line-end-position)))
-(defun split-thai(&optional separator)
+(defun split-thai()
"Break Thai words from point to end of buffer by inserting a
separator string at word boundaries. (wrapper for
'thai-break-words)"
(interactive)
- (thai-break-words (or separator " ") (point-max)))
+ (thai-break-words " " (point-max))
+ (split-thai-numbers (point) (point-max)))
+
+(defun split-thai-numbers(start_point end_point)
+ "helper function to separate numbers in a buffer.
+'thai-break-words doesn't always split numbers properly. this may
+improve tokenization somewhat."
+ ;; xxx this really should be fixed in 'thai-word lib
+ (let* (
+ ;; "\\([๐๑๒๓๔๕๖๗๘๙0123456789]+\\)"
+ (num_rexp "\\([\u0e50-\u0e59]+\\)") ;; thai numbers
+ (nonnum_rexp "\\([\u0e00-\u0e4f\u0e5a-\u0e7f]\\)") ;; "non-numbers"
+ (trailing_rexp (concat num_rexp nonnum_rexp))
+ (leading_rexp (concat nonnum_rexp num_rexp)))
+ (save-restriction
+ (narrow-to-region start_point end_point)
+ (goto-char (point-min))
+ (while (search-forward-regexp trailing_rexp nil t)
+ (replace-match (concat (match-string 1) " " (match-string 2))))
+ (goto-char (point-min))
+ (while (search-forward-regexp leading_rexp nil t)
+ (replace-match (concat (match-string 1) " " (match-string 2))))
+ (goto-char start_point))))
Index: pkgsrc/textproc/split-thai/files/st-icu.cc
diff -u pkgsrc/textproc/split-thai/files/st-icu.cc:1.1 pkgsrc/textproc/split-thai/files/st-icu.cc:1.2
--- pkgsrc/textproc/split-thai/files/st-icu.cc:1.1 Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-icu.cc Mon Aug 17 17:43:15 2020
@@ -13,6 +13,13 @@
using namespace std;
using namespace icu;
+// utf-8 unicode thai values
+// 0x0e1 - 0x0e5b should work for thai_rexp as well...
+const UnicodeString thai_rexp = "[\\u0e00-\\u0e7f]+";
+const UnicodeString thai_consonant = "[\\u0e01-\\u0e2e]+";
+const UnicodeString thai_num_rexp = "[\\u0e50-\\u0e59]+";
+const UnicodeString thai_nonnum_rexp = "[\\u0e01-\\u0e4f\\u0e5a-\\u0e7f]+";
+
void usage() {
const char *progname = "st-icu";
@@ -27,11 +34,11 @@ void usage() {
"returns 0 on succes, or non-zero otherwise" << endl << endl;
}
-// return true if string contains any thai unicode
-bool contains_thai(const UnicodeString &s) {
+// return true if string contains some regexp
+bool matches_regexp(const UnicodeString &s, const UnicodeString &regexp) {
UErrorCode status = U_ZERO_ERROR;
- // matches one or more thai chars, \u0e01-\u0e5b should work too
- RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+ RegexMatcher *matcher = new RegexMatcher(regexp, 0, status);
if (U_FAILURE(status)) {
// syntax errors in the regular expression
@@ -46,11 +53,36 @@ bool contains_thai(const UnicodeString &
return false;
}
+// add spaces to string with thai numbers
+UnicodeString space_thai_numbers(const UnicodeString &s) {
+ // return string unmodified if no numbers
+ if ( ! matches_regexp(s, thai_num_rexp) ) {
+ return s;
+ }
+
+ UnicodeString rs;
+ UChar32 pch;
+ // add spaces between number and non-number
+ for (int i = 0 ; i < s.length(); i++) {
+ if ( u_isWhitespace(s[i]) ) {
+ rs += s[i];
+ } else if ((u_isdigit(s[i]) && !u_isdigit(pch) && matches_regexp(pch, thai_rexp)) ||
+ (u_isdigit(pch) && !u_isdigit(s[i]) && matches_regexp(s[i], thai_rexp))) {
+ rs += " ";
+ rs += s[i];
+ } else {
+ rs += s[i];
+ }
+ pch = s[i];
+ }
+ return rs;
+}
+
// split a unicode string by word boundaries. if arg contains
// whitespaces, it will get consolidated to single spaces.
// if string has no thai characters, return it unmodified
UnicodeString split_words_consolidated(const UnicodeString &s) {
- if ( ! contains_thai(s) ) {
+ if ( ! matches_regexp(s, thai_rexp) ) {
return s;
}
@@ -108,6 +140,8 @@ UnicodeString split_words(const UnicodeS
}
if ( tempStr.length() > 0 )
rs += split_words_consolidated(tempStr);
+
+ rs = space_thai_numbers(rs);
return rs;
}
Index: pkgsrc/textproc/split-thai/files/st-swath
diff -u pkgsrc/textproc/split-thai/files/st-swath:1.1 pkgsrc/textproc/split-thai/files/st-swath:1.2
--- pkgsrc/textproc/split-thai/files/st-swath:1.1 Thu Aug 13 20:52:09 2020
+++ pkgsrc/textproc/split-thai/files/st-swath Mon Aug 17 17:43:15 2020
@@ -6,6 +6,7 @@
# swath settings are split with ' ', longest match, unicode input, and
# unicode output. see swath(1)
#
+swath_cmd=ST_SHARE_BIN/swath
# use merged dictionary unless specified otherwise
if [ -z "$SWATHDICT" ]; then
@@ -16,12 +17,12 @@ if [ "$#" -eq 0 ]; then
# no args, read from stdin
while read line
do
- echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg
+ echo "$line" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
done < /dev/stdin
exit 0
elif [ "$#" -eq 1 -a -e "$1" ]; then
# one arg and arg is an existing file
- swath -b ' ' -m long -u 'u,u' $dictarg < "$1"
+ $swath_cmd -b ' ' -m long -u 'u,u' $dictarg < "$1"
exit $?
elif [ "$#" -ge 1 ]; then
# one or more args, assume it is all text
@@ -34,7 +35,7 @@ elif [ "$#" -ge 1 ]; then
shift
done
- echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg
+ echo "$txt" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
exit $?
else
echo "$0: error parsing args"
Home |
Main Index |
Thread Index |
Old Index