pkgsrc-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[pkgsrc/trunk]: pkgsrc/textproc/split-thai Update to version 2.0



details:   https://anonhg.NetBSD.org/pkgsrc/rev/126bc5e3616a
branches:  trunk
changeset: 448950:126bc5e3616a
user:      scole <scole%pkgsrc.org@localhost>
date:      Thu Mar 18 17:53:41 2021 +0000

description:
Update to version 2.0
- add new emacs module pthai.el, merging all emacs lisp code into it
- rename files thaidict* to words*
- modify other scripts to use new file names

diffstat:

 textproc/split-thai/DESCR                 |     5 +-
 textproc/split-thai/Makefile              |    47 +-
 textproc/split-thai/PLIST                 |    12 +-
 textproc/split-thai/files/README.txt      |     9 +
 textproc/split-thai/files/pthai.el        |  1848 +++++++++++++++++++++++++++++
 textproc/split-thai/files/sampledict.txt  |    45 +
 textproc/split-thai/files/st-emacs        |    17 +-
 textproc/split-thai/files/st-swath        |     2 +-
 textproc/split-thai/files/st-wordbreak    |     2 +-
 textproc/split-thai/files/thai-utility.el |   228 ---
 textproc/split-thai/files/thaidict.abm    |     2 -
 textproc/split-thai/files/words.abm       |     2 +
 12 files changed, 1946 insertions(+), 273 deletions(-)

diffs (truncated from 2376 to 300 lines):

diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/DESCR
--- a/textproc/split-thai/DESCR Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/DESCR Thu Mar 18 17:53:41 2021 +0000
@@ -3,5 +3,6 @@
 utilities use emacs, swath, perl, and a c++ icu-project program.  All
 use dictionary-based word splitting.
 
-Also included is a merged dictionary file of Thai words and a perl
-script to grep Thai UTF-8 words.
+Also included is a merged dictionary file of Thai words, a perl script
+to grep Thai UTF-8 words, and an emacs library that can split and play
+audio for Thai words.
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/Makefile
--- a/textproc/split-thai/Makefile      Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/Makefile      Thu Mar 18 17:53:41 2021 +0000
@@ -1,11 +1,11 @@
-# $NetBSD: Makefile,v 1.13 2020/11/05 09:09:15 ryoon Exp $
+# $NetBSD: Makefile,v 1.14 2021/03/18 17:53:41 scole Exp $
 
-PKGNAME=       split-thai-1.1
-PKGREVISION=   1
+PKGNAME=       split-thai-2.0
 CATEGORIES=    textproc
-MAINTAINER=    pkgsrc-users%NetBSD.org@localhost
-COMMENT=       Utilities to split UTF-8 Thai text into words
-LICENSE=       public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
+MAINTAINER=    scole%NetBSD.org@localhost
+COMMENT=       Utilities and an emacs library to split UTF-8 Thai text into words
+# pthai.el, other code, icu dict, swath dict
+LICENSE=       2-clause-bsd AND public-domain AND mit AND gnu-gpl-v2
 
 # xxx fetching a specific version of a file out of a github project
 EXTRACT_SUFX=  # none
@@ -20,6 +20,7 @@
 BUILD_DEPENDS+=        libdatrie-[0-9]*:../../devel/libdatrie
 DEPENDS+=      emacs-[0-9]*:../../editors/emacs
 DEPENDS+=      swath-[0-9]*:../../textproc/swath
+DEPENDS+=      mpg123-[0-9]*:../../audio/mpg123
 
 REPLACE_PERL=  st-wordbreak tgrep
 REPLACE_SH=    st-swath
@@ -30,8 +31,7 @@
 ST_SHARE_BIN=          bin
 INSTALLATION_DIRS=     ${ST_SHARE_BIN} ${ST_SHARE_DIR}
 
-ST_SHARE_FILES=                README.txt thaidict thai-dict.el thai-dict.elc
-ST_SHARE_FILES+=       thai-utility.el thai-utility.elc thaidict.tri
+ST_SHARE_FILES=                README.txt pthai.el sampledict.txt words words.tri
 
 # xxx REPLACE_EMACS_SCRIPT
 SUBST_CLASSES+=                        st-emacs-app
@@ -41,39 +41,36 @@
 SUBST_SED.st-emacs-app=                -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
 
 SUBST_CLASSES+=                        dictionary-app
-SUBST_STAGE.dictionary-app=    pre-configure
+SUBST_STAGE.dictionary-app=    post-extract
 SUBST_MESSAGE.dictionary-app=  Fixing dictionary paths.
-SUBST_FILES.dictionary-app=    st-emacs st-swath st-wordbreak
+SUBST_FILES.dictionary-app=    st-emacs st-swath st-wordbreak pthai.el
 SUBST_SED.dictionary-app=      -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
 SUBST_SED.dictionary-app+=     -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
 
 pre-extract:
        mkdir -p ${WRKSRC}
-       cd files && cp README.txt st-emacs st-icu.cc st-swath \
-               st-wordbreak tgrep thai-utility.el thaidict.abm ${WRKSRC}
+       cd files && cp README.txt pthai.el sampledict.txt \
+               st-emacs st-icu.cc st-swath st-wordbreak tgrep \
+               words.abm ${WRKSRC}
 
-post-extract:
+pre-build:
        cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
-               -f batch-byte-compile thai-utility.el
-       cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
-               --eval '(thai-word-table-save "emacs-dict")'
+               --eval='(setq pthai-bootstrap t)' \
+               --eval='(load-file "pthai.el")' \
+               --eval='(pthai-twt-table-save "thai-word-dict")'
        cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
        cd ${PREFIX}/share/swath && \
                ${UTF8_ENV} trietool swathdic list | \
                awk '{print $$1}' > ${WRKSRC}/swath-dict
        cd ${WRKSRC} && \
-               ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
-                       grep -v '#' | sort | uniq > thaidict
+               ${UTF8_ENV} cat icu-dict swath-dict thai-word-dict | \
+                       grep -v '#' | sort | uniq > words
        cd ${WRKSRC} && \
-               ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
-       cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
-               --eval '(thai-word-table-save-defvar "thaidict" "thai-dict.el")'
-       cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
-               -f batch-byte-compile thai-dict.el
-.for i in emacs-dict icu-dict swath-dict
+               ${UTF8_ENV} trietool words add-list -e utf-8 words
+.for i in thai-word-dict icu-dict swath-dict
        @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
 .endfor
-       @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
+       @${ECHO} `wc -l ${WRKSRC}/words | awk '{print $$1}'` \
                unique words in combined dictionary
 
 do-build:
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/PLIST
--- a/textproc/split-thai/PLIST Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/PLIST Thu Mar 18 17:53:41 2021 +0000
@@ -1,13 +1,11 @@
-@comment $NetBSD: PLIST,v 1.4 2020/09/05 18:02:36 scole Exp $
+@comment $NetBSD: PLIST,v 1.5 2021/03/18 17:53:41 scole Exp $
 bin/st-emacs
 bin/st-icu
 bin/st-swath
 bin/st-wordbreak
 bin/tgrep
 share/split-thai/README.txt
-share/split-thai/thai-dict.el
-share/split-thai/thai-dict.elc
-share/split-thai/thai-utility.el
-share/split-thai/thai-utility.elc
-share/split-thai/thaidict
-share/split-thai/thaidict.tri
+share/split-thai/pthai.el
+share/split-thai/sampledict.txt
+share/split-thai/words
+share/split-thai/words.tri
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/files/README.txt
--- a/textproc/split-thai/files/README.txt      Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/files/README.txt      Thu Mar 18 17:53:41 2021 +0000
@@ -4,6 +4,7 @@
      st-swath
      st-wordbreak
      tgrep
+     pthai.el
 
 SYNOPSIS
      st-emacs|st-icu|st-swath|st-wordbreak [filename|text1 text2 ...|'blank']
@@ -25,6 +26,9 @@
 
      tgrep:        grep-like utility using perl, see "tgrep -h"
 
+     pthai.el:     emacs library for handling thai text in an emacs buffer,
+                   including word splitting
+     
 EXAMPLES
       split one or more text strings:
       # st-swath แมวและหมา
@@ -74,8 +78,13 @@
      icu4c/source/data/brkitr/dictionaries/thaidict.txt and then
      rebuild the whole library.
 
+     Also included in this package is an emacs library called "pthai"
+     (practice-thai).  It can do word splitting, play mp3 audio for
+     thai words and a few other things.
+     
 SEE ALSO
      swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+     trans(1) from pkgsrc/textproc/translate-shell
 
 BUGS
      st-icu should also use the combined dictionary words.
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/files/pthai.el
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/pthai.el        Thu Mar 18 17:53:41 2021 +0000
@@ -0,0 +1,1848 @@
+;; Copyright (c) 2021 Sean Cole <scole%NetBSD.org@localhost>
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions
+;; are met:
+;;
+;; 1. Redistributions of source code must retain the above copyright
+;;    notice, this list of conditions and the following disclaimer.
+;; 2. Redistributions in binary form must reproduce the above copyright
+;;    notice, this list of conditions and the following disclaimer in the
+;;    documentation and/or other materials provided with the distribution.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+;; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+;; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+;; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+;; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+;; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+;; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;; POSSIBILITY OF SUCH DAMAGE.
+
+;; utilities for working with thai text in emacs buffers.  includes
+;; functions to split/unsplit thai strings, thai time conversion,
+;; download and play mp3 audio for thai words from thai-language.com,
+;; clickthai-online.com, and thai2english.com, and currency
+;; conversions from x-rates.com.  Also, it can look up thai words if
+;; vocabulary files are created.  The 'customize interface is available
+;; for a few settings.
+;;
+;; HOW TO USE
+;; - in ~/.emacs.d/init.el, add library path and a require, e.g.:
+;;     (add-to-list 'load-path "ST_SHARE_DIR")
+;;     (require 'pthai)
+;;   or load module directly:
+;;     (load-file "ST_SHARE_DIR/pthai.el")
+;;
+;; - also in ~/.emacs.d/init.el, possibly bind keys to some commonly
+;;   used functions:
+;;     (global-set-key [f8] 'pthai-lookup)
+;;      (global-set-key [f9] 'pthai-say-word)
+;;      (global-set-key [f10] 'pthai-say-line)
+;; - 'customize can be used to set or add paths to words lists and
+;;   dictionaries
+;; - M-x apropos pthai for available functions
+;;
+;; TODO
+;; - make info file for this module
+;; - when splitting, handle unknown/misspelled words better
+;; - look up word definitions on the fly (?)
+;; - keep original spacing when possible when splitting (?)
+;; - interface with pkgsrc/textproc/translate-shell (?),
+;;   not sure about licensing issues
+;; - don't try to download and say single letters in pthai-say except maybe ๆ. this
+;;   happens sometimes when a word is misspelled or unknown words are found
+;; - do better breaking of words with ๆ in dictionaries, like สั้นๆ
+;; - for pthai-rwb, maybe use non-brute-force/dynamic-programming algorithm
+;; - get byte compile working
+;; - get initial loading of default wordlist and sample dictionary working
+;;   with customize*
+;; - create directories as needed through customize interface (?)
+;;     pthai-default-directory    ~/.emacs.d/pthai
+;;     pthai-audio-directory      ~/.emacs.d/pthai/audio      or specified
+;;     dictionaries               ~/.emacs.d/pthai/dictionary or specified w/ pthai-dictionary-list
+;;     wordlists                  ~/.emacs.d/pthai/wordlist   or specified w/ pthai-wordlist-list
+
+;; known issues:
+;; - doesn't always handle "long" strings well
+;; - pthai-say-word after pthai-split line, last word is not always played.
+;;   this seems to be emacs *shell* issue
+;; - when linting or compiling file, need to "(require 'seq)" first(?)
+;;
+(require 'cus-edit)   ;; custom* customize*
+(require 'ido)        ;; ido-completing-read*
+(require 'mule-util)  ;; nested alist functions
+(require 'seq)        ;; seq-* functions   
+(require 'subr-x)     ;; string-trim* functions
+(require 'thai-word)  ;; thai-word-table
+(require 'thingatpt)  ;; thing-at-point*
+(require 'url)        ;; url-* functions
+
+;; xxx "special" vars to set before loading module, not the emacs way?
+(unless (boundp 'pthai-bootstrap)
+  (defvar pthai-bootstrap nil "nil unless building for pkgsrc"))
+
+(unless (boundp 'pthai-verbose-wordloads)
+  (defvar pthai-verbose-wordloads t
+    "if non-nil, display word counts when loading dictionaries or wordlists"))
+
+;; "normal" module variables
+(defvar pthai-default-directory (concat user-emacs-directory "pthai/")
+  "default pthai directory (ensure ends with directory separator)")
+
+(defvar pthai-wordlist (make-hash-table :test 'equal)
+  "hash table of thai words mapped to 1")
+
+(defvar pthai-dictionary (make-hash-table :test 'equal)
+    "thaiword => '( def eng_class thai_class where definition, eng_classifiers, thai_classifiers are all lists of strings.  empty definitions should be defined as nil \"ไก่\" => ( '(\"chicken\") nil '(\"ตัว\") )")
+
+(defvar pthai-misc-punctuation-regexp
+  (regexp-opt
+   (list "~" "`" "!" "@" "#" "\$" "%" "^" "&" "*" "(" ")"
+        "-" "_" "=" "+" "\\" "|" "{" "}" "[" "]"
+        ";" ":" "'" "\"" "<" ">" "." "," "/" "?"
+        "ๆ" "ฯาฯ" "ฯ" "฿" "๏" "๚" "๛"))
+  "regexp of misc punctuation used for word splitting")
+
+(defvar pthai-rwb-tmp nil "temporary variable for pthai-rwb") 
+
+(defgroup pthai nil
+  "Pthai dictionary, wordlist, and word-splitting."
+  :group 'applications)
+
+(defcustom pthai-use-external-splitters t
+  "use external programs to help word splitting, which may be slower"
+  :group 'pthai
+  :type 'boolean)
+
+(defcustom pthai-split-mode "biggest"
+  "Type of word splitting"
+  :group 'pthai
+  :set (lambda (sym val) (set-default sym val))
+  :type '(radio (const :tag "biggest words possible/fewest words" :value "biggest")
+               (const :tag "smallest words possible/most words" :value "smallest")
+               (const :tag "interactively display choices" :value "interactive")))
+
+(defcustom pthai-mp3-player nil
+  "default command of audio player for mp3 files"
+  :group 'pthai
+  :type 'string)
+


Home | Main Index | Thread Index | Old Index