pkgsrc-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[pkgsrc/trunk]: pkgsrc/textproc/split-thai Update to version 2.0
details: https://anonhg.NetBSD.org/pkgsrc/rev/126bc5e3616a
branches: trunk
changeset: 448950:126bc5e3616a
user: scole <scole%pkgsrc.org@localhost>
date: Thu Mar 18 17:53:41 2021 +0000
description:
Update to version 2.0
- add new emacs module pthai.el, merging all emacs lisp code into it
- rename files thaidict* to words*
- modify other scripts to use new file names
diffstat:
textproc/split-thai/DESCR | 5 +-
textproc/split-thai/Makefile | 47 +-
textproc/split-thai/PLIST | 12 +-
textproc/split-thai/files/README.txt | 9 +
textproc/split-thai/files/pthai.el | 1848 +++++++++++++++++++++++++++++
textproc/split-thai/files/sampledict.txt | 45 +
textproc/split-thai/files/st-emacs | 17 +-
textproc/split-thai/files/st-swath | 2 +-
textproc/split-thai/files/st-wordbreak | 2 +-
textproc/split-thai/files/thai-utility.el | 228 ---
textproc/split-thai/files/thaidict.abm | 2 -
textproc/split-thai/files/words.abm | 2 +
12 files changed, 1946 insertions(+), 273 deletions(-)
diffs (truncated from 2376 to 300 lines):
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/DESCR
--- a/textproc/split-thai/DESCR Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/DESCR Thu Mar 18 17:53:41 2021 +0000
@@ -3,5 +3,6 @@
utilities use emacs, swath, perl, and a c++ icu-project program. All
use dictionary-based word splitting.
-Also included is a merged dictionary file of Thai words and a perl
-script to grep Thai UTF-8 words.
+Also included is a merged dictionary file of Thai words, a perl script
+to grep Thai UTF-8 words, and an emacs library that can split and play
+audio for Thai words.
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/Makefile
--- a/textproc/split-thai/Makefile Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/Makefile Thu Mar 18 17:53:41 2021 +0000
@@ -1,11 +1,11 @@
-# $NetBSD: Makefile,v 1.13 2020/11/05 09:09:15 ryoon Exp $
+# $NetBSD: Makefile,v 1.14 2021/03/18 17:53:41 scole Exp $
-PKGNAME= split-thai-1.1
-PKGREVISION= 1
+PKGNAME= split-thai-2.0
CATEGORIES= textproc
-MAINTAINER= pkgsrc-users%NetBSD.org@localhost
-COMMENT= Utilities to split UTF-8 Thai text into words
-LICENSE= public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
+MAINTAINER= scole%NetBSD.org@localhost
+COMMENT= Utilities and an emacs library to split UTF-8 Thai text into words
+# pthai.el, other code, icu dict, swath dict
+LICENSE= 2-clause-bsd AND public-domain AND mit AND gnu-gpl-v2
# xxx fetching a specific version of a file out of a github project
EXTRACT_SUFX= # none
@@ -20,6 +20,7 @@
BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie
DEPENDS+= emacs-[0-9]*:../../editors/emacs
DEPENDS+= swath-[0-9]*:../../textproc/swath
+DEPENDS+= mpg123-[0-9]*:../../audio/mpg123
REPLACE_PERL= st-wordbreak tgrep
REPLACE_SH= st-swath
@@ -30,8 +31,7 @@
ST_SHARE_BIN= bin
INSTALLATION_DIRS= ${ST_SHARE_BIN} ${ST_SHARE_DIR}
-ST_SHARE_FILES= README.txt thaidict thai-dict.el thai-dict.elc
-ST_SHARE_FILES+= thai-utility.el thai-utility.elc thaidict.tri
+ST_SHARE_FILES= README.txt pthai.el sampledict.txt words words.tri
# xxx REPLACE_EMACS_SCRIPT
SUBST_CLASSES+= st-emacs-app
@@ -41,39 +41,36 @@
SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
SUBST_CLASSES+= dictionary-app
-SUBST_STAGE.dictionary-app= pre-configure
+SUBST_STAGE.dictionary-app= post-extract
SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
-SUBST_FILES.dictionary-app= st-emacs st-swath st-wordbreak
+SUBST_FILES.dictionary-app= st-emacs st-swath st-wordbreak pthai.el
SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
pre-extract:
mkdir -p ${WRKSRC}
- cd files && cp README.txt st-emacs st-icu.cc st-swath \
- st-wordbreak tgrep thai-utility.el thaidict.abm ${WRKSRC}
+ cd files && cp README.txt pthai.el sampledict.txt \
+ st-emacs st-icu.cc st-swath st-wordbreak tgrep \
+ words.abm ${WRKSRC}
-post-extract:
+pre-build:
cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
- -f batch-byte-compile thai-utility.el
- cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
- --eval '(thai-word-table-save "emacs-dict")'
+ --eval='(setq pthai-bootstrap t)' \
+ --eval='(load-file "pthai.el")' \
+ --eval='(pthai-twt-table-save "thai-word-dict")'
cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
cd ${PREFIX}/share/swath && \
${UTF8_ENV} trietool swathdic list | \
awk '{print $$1}' > ${WRKSRC}/swath-dict
cd ${WRKSRC} && \
- ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
- grep -v '#' | sort | uniq > thaidict
+ ${UTF8_ENV} cat icu-dict swath-dict thai-word-dict | \
+ grep -v '#' | sort | uniq > words
cd ${WRKSRC} && \
- ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
- cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
- --eval '(thai-word-table-save-defvar "thaidict" "thai-dict.el")'
- cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
- -f batch-byte-compile thai-dict.el
-.for i in emacs-dict icu-dict swath-dict
+ ${UTF8_ENV} trietool words add-list -e utf-8 words
+.for i in thai-word-dict icu-dict swath-dict
@${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
.endfor
- @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
+ @${ECHO} `wc -l ${WRKSRC}/words | awk '{print $$1}'` \
unique words in combined dictionary
do-build:
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/PLIST
--- a/textproc/split-thai/PLIST Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/PLIST Thu Mar 18 17:53:41 2021 +0000
@@ -1,13 +1,11 @@
-@comment $NetBSD: PLIST,v 1.4 2020/09/05 18:02:36 scole Exp $
+@comment $NetBSD: PLIST,v 1.5 2021/03/18 17:53:41 scole Exp $
bin/st-emacs
bin/st-icu
bin/st-swath
bin/st-wordbreak
bin/tgrep
share/split-thai/README.txt
-share/split-thai/thai-dict.el
-share/split-thai/thai-dict.elc
-share/split-thai/thai-utility.el
-share/split-thai/thai-utility.elc
-share/split-thai/thaidict
-share/split-thai/thaidict.tri
+share/split-thai/pthai.el
+share/split-thai/sampledict.txt
+share/split-thai/words
+share/split-thai/words.tri
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/files/README.txt
--- a/textproc/split-thai/files/README.txt Thu Mar 18 17:32:32 2021 +0000
+++ b/textproc/split-thai/files/README.txt Thu Mar 18 17:53:41 2021 +0000
@@ -4,6 +4,7 @@
st-swath
st-wordbreak
tgrep
+ pthai.el
SYNOPSIS
st-emacs|st-icu|st-swath|st-wordbreak [filename|text1 text2 ...|'blank']
@@ -25,6 +26,9 @@
tgrep: grep-like utility using perl, see "tgrep -h"
+ pthai.el: emacs library for handling thai text in an emacs buffer,
+ including word splitting
+
EXAMPLES
split one or more text strings:
# st-swath แมวและหมา
@@ -74,8 +78,13 @@
icu4c/source/data/brkitr/dictionaries/thaidict.txt and then
rebuild the whole library.
+ Also included in this package is an emacs library called "pthai"
+ (practice-thai). It can do word splitting, play mp3 audio for
+ thai words and a few other things.
+
SEE ALSO
swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+ trans(1) from pkgsrc/textproc/translate-shell
BUGS
st-icu should also use the combined dictionary words.
diff -r 9dc44a0011f3 -r 126bc5e3616a textproc/split-thai/files/pthai.el
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/textproc/split-thai/files/pthai.el Thu Mar 18 17:53:41 2021 +0000
@@ -0,0 +1,1848 @@
+;; Copyright (c) 2021 Sean Cole <scole%NetBSD.org@localhost>
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions
+;; are met:
+;;
+;; 1. Redistributions of source code must retain the above copyright
+;; notice, this list of conditions and the following disclaimer.
+;; 2. Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+;; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+;; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+;; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+;; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+;; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+;; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;; POSSIBILITY OF SUCH DAMAGE.
+
+;; utilities for working with thai text in emacs buffers. includes
+;; functions to split/unsplit thai strings, thai time conversion,
+;; download and play mp3 audio for thai words from thai-language.com,
+;; clickthai-online.com, and thai2english.com, and currency
+;; conversions from x-rates.com. Also, it can look up thai words if
+;; vocabulary files are created. The 'customize interface is available
+;; for a few settings.
+;;
+;; HOW TO USE
+;; - in ~/.emacs.d/init.el, add library path and a require, e.g.:
+;; (add-to-list 'load-path "ST_SHARE_DIR")
+;; (require 'pthai)
+;; or load module directly:
+;; (load-file "ST_SHARE_DIR/pthai.el")
+;;
+;; - also in ~/.emacs.d/init.el, possibly bind keys to some commonly
+;; used functions:
+;; (global-set-key [f8] 'pthai-lookup)
+;; (global-set-key [f9] 'pthai-say-word)
+;; (global-set-key [f10] 'pthai-say-line)
+;; - 'customize can be used to set or add paths to words lists and
+;; dictionaries
+;; - M-x apropos pthai for available functions
+;;
+;; TODO
+;; - make info file for this module
+;; - when splitting, handle unknown/misspelled words better
+;; - look up word definitions on the fly (?)
+;; - keep original spacing when possible when splitting (?)
+;; - interface with pkgsrc/textproc/translate-shell (?),
+;; not sure about licensing issues
+;; - don't try to download and say single letters in pthai-say, except maybe ๆ; this
+;; happens sometimes when a word is misspelled or unknown words are found
+;; - do better breaking of words with ๆ in dictionaries, like สั้นๆ
+;; - for pthai-rwb, maybe use non-brute-force/dynamic-programming algorithm
+;; - get byte compile working
+;; - get initial loading of default wordlist and sample dictionary working
+;; with customize*
+;; - create directories as needed through customize interface (?)
+;; pthai-default-directory ~/.emacs.d/pthai
+;; pthai-audio-directory ~/.emacs.d/pthai/audio or specified
+;; dictionaries ~/.emacs.d/pthai/dictionary or specified w/ pthai-dictionary-list
+;; wordlists ~/.emacs.d/pthai/wordlist or specified w/ pthai-wordlist-list
+
+;; known issues:
+;; - doesn't always handle "long" strings well
+;; - pthai-say-word after pthai-split line, last word is not always played.
+;; this seems to be emacs *shell* issue
+;; - when linting or compiling file, need to "(require 'seq)" first(?)
+;;
+(require 'cus-edit) ;; custom* customize*
+(require 'ido) ;; ido-completing-read*
+(require 'mule-util) ;; nested alist functions
+(require 'seq) ;; seq-* functions
+(require 'subr-x) ;; string-trim* functions
+(require 'thai-word) ;; thai-word-table
+(require 'thingatpt) ;; thing-at-point*
+(require 'url) ;; url-* functions
+
+;; xxx "special" vars to set before loading module, not the emacs way?
+(unless (boundp 'pthai-bootstrap)
+ (defvar pthai-bootstrap nil "nil unless building for pkgsrc"))
+
+(unless (boundp 'pthai-verbose-wordloads)
+ (defvar pthai-verbose-wordloads t
+ "if non-nil, display word counts when loading dictionaries or wordlists"))
+
+;; "normal" module variables
+(defvar pthai-default-directory (concat user-emacs-directory "pthai/")
+ "default pthai directory (ensure ends with directory separator)")
+
+(defvar pthai-wordlist (make-hash-table :test 'equal)
+ "hash table of thai words mapped to 1")
+
+(defvar pthai-dictionary (make-hash-table :test 'equal)
+ "thaiword => '( def eng_class thai_class where definition, eng_classifiers, thai_classifiers are all lists of strings. empty definitions should be defined as nil \"ไก่\" => ( '(\"chicken\") nil
+'(\"ตัว\") )")
+
+(defvar pthai-misc-punctuation-regexp
+ (regexp-opt
+ (list "~" "`" "!" "@" "#" "\$" "%" "^" "&" "*" "(" ")"
+ "-" "_" "=" "+" "\\" "|" "{" "}" "[" "]"
+ ";" ":" "'" "\"" "<" ">" "." "," "/" "?"
+ "ๆ" "ฯาฯ" "ฯ" "฿" "๏" "๚" "๛"))
+ "regexp of misc punctuation used for word splitting")
+
+(defvar pthai-rwb-tmp nil "temporary variable for pthai-rwb")
+
+(defgroup pthai nil
+ "Pthai dictionary, wordlist, and word-splitting."
+ :group 'applications)
+
+(defcustom pthai-use-external-splitters t
+ "use external programs to help word splitting, which may be slower"
+ :group 'pthai
+ :type 'boolean)
+
+(defcustom pthai-split-mode "biggest"
+ "Type of word splitting"
+ :group 'pthai
+ :set (lambda (sym val) (set-default sym val))
+ :type '(radio (const :tag "biggest words possible/fewest words" :value "biggest")
+ (const :tag "smallest words possible/most words" :value "smallest")
+ (const :tag "interactively display choices" :value "interactive")))
+
+(defcustom pthai-mp3-player nil
+ "default command of audio player for mp3 files"
+ :group 'pthai
+ :type 'string)
+
Home |
Main Index |
Thread Index |
Old Index