[pkgsrc/trunk]: pkgsrc/textproc/split-thai Update to 0.4

To: source-changes-hg%NetBSD.org@localhost
Subject: [pkgsrc/trunk]: pkgsrc/textproc/split-thai Update to 0.4
From: scole <scole%pkgsrc.org@localhost>
Date: Mon, 17 Aug 2020 18:39:44 +0000
details:   https://anonhg.NetBSD.org/pkgsrc/rev/845af97f16b5
branches:  trunk
changeset: 437144:845af97f16b5
user:      scole <scole%pkgsrc.org@localhost>
date:      Mon Aug 17 17:43:15 2020 +0000

description:
Update to 0.4
- always use pkgsrc path for swath for st-swath script
- make splitting of numbers a little more consistent for st-emacs & st-icu
- add split-thai, split-thai-line, wrapper functions to emacs lisp code

diffstat:

 textproc/split-thai/Makefile              |   8 +++--
 textproc/split-thai/files/README.txt      |   2 +-
 textproc/split-thai/files/st-emacs        |   4 +-
 textproc/split-thai/files/st-icu.cc       |  44 +++++++++++++++++++++++++++---
 textproc/split-thai/files/st-swath        |   7 ++--
 textproc/split-thai/files/thai-utility.el |  31 +++++++++++++++++++--
 6 files changed, 78 insertions(+), 18 deletions(-)

diffs (221 lines):

diff -r 6b59a2eb1cb8 -r 845af97f16b5 textproc/split-thai/Makefile
--- a/textproc/split-thai/Makefile      Mon Aug 17 15:23:28 2020 +0000
+++ b/textproc/split-thai/Makefile      Mon Aug 17 17:43:15 2020 +0000
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.3 2020/08/15 16:52:28 scole Exp $
+# $NetBSD: Makefile,v 1.4 2020/08/17 17:43:15 scole Exp $
 
-PKGNAME=       split-thai-0.3
+PKGNAME=       split-thai-0.4
 CATEGORIES=    textproc
 MAINTAINER=    pkgsrc-users%NetBSD.org@localhost
 COMMENT=       Utilities to split UTF-8 Thai text into words
@@ -24,7 +24,8 @@
 UTF8_ENV=      env LC_ALL=C.UTF-8
 
 ST_SHARE_DIR=          share/split-thai
-INSTALLATION_DIRS=     bin ${ST_SHARE_DIR}
+ST_SHARE_BIN=          bin
+INSTALLATION_DIRS=     ${ST_SHARE_BIN} ${ST_SHARE_DIR}
 
 ST_SHARE_FILES=                README.txt thaidict thai-dict.el thai-dict.elc
 ST_SHARE_FILES+=       thai-utility.el thai-utility.elc thaidict.tri
@@ -41,6 +42,7 @@
 SUBST_MESSAGE.dictionary-app=  Fixing dictionary paths.
 SUBST_FILES.dictionary-app=    st-emacs st-swath
 SUBST_SED.dictionary-app=      -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+SUBST_SED.dictionary-app+=     -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
 
 pre-extract:
        mkdir -p ${WRKSRC}
diff -r 6b59a2eb1cb8 -r 845af97f16b5 textproc/split-thai/files/README.txt
--- a/textproc/split-thai/files/README.txt      Mon Aug 17 15:23:28 2020 +0000
+++ b/textproc/split-thai/files/README.txt      Mon Aug 17 17:43:15 2020 +0000
@@ -66,5 +66,5 @@
 
 BUGS
      st-icu should also use the combined dictionary words.
-     st-emacs and st-icu don't always split thai numbers well.
+     thai text mixed with other languages may not be handled well.
      this file should be converted to a proper manpage.
diff -r 6b59a2eb1cb8 -r 845af97f16b5 textproc/split-thai/files/st-emacs
--- a/textproc/split-thai/files/st-emacs        Mon Aug 17 15:23:28 2020 +0000
+++ b/textproc/split-thai/files/st-emacs        Mon Aug 17 17:43:15 2020 +0000
@@ -18,7 +18,7 @@
   (with-temp-buffer
     (insert line)
     (goto-char (point-min))
-    (thai-break-words " ")
+    (split-thai-line)
     (buffer-string)))
 
 ;; hack to process stdin
@@ -48,6 +48,6 @@
     (insert (mapconcat 'identity (cdddr command-line-args) " "))
     (insert "\n"))
   (goto-char (point-min))
-  (thai-break-words " ")
+  (split-thai)
   (write-region nil nil "/dev/stdout"))
 (kill-emacs 0)
diff -r 6b59a2eb1cb8 -r 845af97f16b5 textproc/split-thai/files/st-icu.cc
--- a/textproc/split-thai/files/st-icu.cc       Mon Aug 17 15:23:28 2020 +0000
+++ b/textproc/split-thai/files/st-icu.cc       Mon Aug 17 17:43:15 2020 +0000
@@ -13,6 +13,13 @@
 using namespace std;
 using namespace icu;
 
+// unicode code point ranges for thai, written as regexp patterns
+// 0x0e01 - 0x0e5b should work for thai_rexp as well...
+const UnicodeString thai_rexp = "[\\u0e00-\\u0e7f]+"; // any thai-range char
+const UnicodeString thai_consonant = "[\\u0e01-\\u0e2e]+";
+const UnicodeString thai_num_rexp = "[\\u0e50-\\u0e59]+"; // thai digits
+const UnicodeString thai_nonnum_rexp = "[\\u0e01-\\u0e4f\\u0e5a-\\u0e7f]+"; // thai range minus digits
+
 void usage() {
  const char *progname = "st-icu";
        
@@ -27,11 +34,11 @@
      "returns 0 on succes, or non-zero otherwise" << endl << endl;
 }
 
-// return true if string contains any thai unicode
-bool contains_thai(const UnicodeString &s) {
+// return true if string contains some regexp
+bool matches_regexp(const UnicodeString &s, const UnicodeString &regexp) {
        UErrorCode status = U_ZERO_ERROR;
-       // matches one or more thai chars, \u0e01-\u0e5b should work too
-       RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+       RegexMatcher *matcher = new RegexMatcher(regexp, 0, status);
 
        if (U_FAILURE(status)) {
                // syntax errors in the regular expression
@@ -46,11 +53,36 @@
                return false;
 }
 
+// put a space at each boundary between a run of digits and adjacent
+// thai text; a string without thai digits is returned unmodified
+UnicodeString space_thai_numbers(const UnicodeString &s) {
+       if ( ! matches_regexp(s, thai_num_rexp) ) {
+               return s;
+       }
+
+       UnicodeString rs;
+       // previous code unit.  must start out as a non-digit, non-thai
+       // value: it is read on the first pass, before any assignment
+       UChar32 pch = 0;
+       for (int i = 0 ; i < s.length(); i++) {
+               const UChar ch = s[i];
+               // whitespace is copied as-is and never gets a space added
+               if ( !u_isWhitespace(ch) &&
+                    ((u_isdigit(ch) && !u_isdigit(pch) && matches_regexp(pch, thai_rexp)) ||
+                     (u_isdigit(pch) && !u_isdigit(ch) && matches_regexp(ch, thai_rexp))) ) {
+                       rs += " ";
+               }
+               rs += ch;
+               pch = ch;
+       }
+       return rs;
+}
+
 // split a unicode string by word boundaries.  if arg contains
 // whitespaces, it will get consolidated to single spaces.
 // if string has no thai characters, return it unmodified
 UnicodeString split_words_consolidated(const UnicodeString &s) {
-       if ( ! contains_thai(s) ) {
+       if ( ! matches_regexp(s, thai_rexp) ) {
                return s;
        }
        
@@ -108,6 +140,8 @@
        }
        if ( tempStr.length() > 0 )
                rs += split_words_consolidated(tempStr);
+
+       rs = space_thai_numbers(rs);
        return rs;
 }
 
diff -r 6b59a2eb1cb8 -r 845af97f16b5 textproc/split-thai/files/st-swath
--- a/textproc/split-thai/files/st-swath        Mon Aug 17 15:23:28 2020 +0000
+++ b/textproc/split-thai/files/st-swath        Mon Aug 17 17:43:15 2020 +0000
@@ -6,6 +6,7 @@
 # swath settings are split with ' ', longest match, unicode input, and
 # unicode output.  see swath(1)
 #
+swath_cmd=ST_SHARE_BIN/swath
 
 # use merged dictionary unless specified otherwise
 if [ -z "$SWATHDICT" ]; then
@@ -16,12 +17,12 @@
     # no args, read from stdin
     while read line
     do
-       echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg
+       echo "$line" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
     done < /dev/stdin
     exit 0 
 elif [ "$#" -eq 1 -a -e "$1" ]; then
     # one arg and arg is an existing file
-    swath -b ' ' -m long -u 'u,u' $dictarg < "$1"
+    $swath_cmd -b ' ' -m long -u 'u,u' $dictarg < "$1"
     exit $?
 elif [ "$#" -ge 1 ]; then
     # one or more args, assume it is all text
@@ -34,7 +35,7 @@
 
        shift
     done
-    echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg
+    echo "$txt" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
     exit $?
 else
     echo "$0: error parsing args"
diff -r 6b59a2eb1cb8 -r 845af97f16b5 textproc/split-thai/files/thai-utility.el
--- a/textproc/split-thai/files/thai-utility.el Mon Aug 17 15:23:28 2020 +0000
+++ b/textproc/split-thai/files/thai-utility.el Mon Aug 17 17:43:15 2020 +0000
@@ -168,15 +168,38 @@
       (write-region nil nil lispfile))
     line_count))
 
-(defun split-thai-line(&optional separator)
+(defun split-thai-line()
   "Break Thai words from point to end of line by inserting a
 separator string at word boundaries. (wrapper for 'thai-break-words)"
   (interactive)
-    (thai-break-words (or separator " ") (line-end-position)))
+  (thai-break-words " " (line-end-position))
+  (split-thai-numbers (point) (line-end-position)))
 
-(defun split-thai(&optional separator)
+(defun split-thai()
   "Break Thai words from point to end of buffer by inserting a
 separator string at word boundaries. (wrapper for
 'thai-break-words)"
   (interactive)
-    (thai-break-words (or separator " ") (point-max)))
+  (thai-break-words " " (point-max))
+  (split-thai-numbers (point) (point-max)))
+
+(defun split-thai-numbers(start_point end_point)
+  "helper function to separate numbers in a buffer.
+'thai-break-words doesn't always split numbers properly. this may
+improve tokenization somewhat."
+  ;; XXX workaround: this really should be fixed in 'thai-word lib
+  (let* (
+        ;; unused variant that also lists ascii digits: "\\([๐๑๒๓๔๕๖๗๘๙0123456789]+\\)"
+        (num_rexp "\\([\u0e50-\u0e59]+\\)") ;; one or more thai digits
+        (nonnum_rexp "\\([\u0e00-\u0e4f\u0e5a-\u0e7f]\\)") ;; one thai-range char that is not a digit
+        (trailing_rexp (concat num_rexp nonnum_rexp))
+        (leading_rexp (concat nonnum_rexp num_rexp)))
+    (save-restriction
+      (narrow-to-region start_point end_point)
+      (goto-char (point-min))
+      (while (search-forward-regexp trailing_rexp nil t) ;; digits then non-digit
+       (replace-match (concat (match-string 1) " " (match-string 2))))
+      (goto-char (point-min)) ;; second pass starts over from region start
+      (while (search-forward-regexp leading_rexp nil t) ;; non-digit then digits
+       (replace-match (concat (match-string 1) " " (match-string 2))))
+      (goto-char start_point))))
Prev by Date: [src/trunk]: src/usr.bin/tsort be host friendly
Next by Date: [pkgsrc/trunk]: pkgsrc/doc doc: Updated textproc/split-thai to 0.4
Previous by Thread: [src/trunk]: src/usr.bin/tsort be host friendly
Next by Thread: [pkgsrc/trunk]: pkgsrc/doc doc: Updated textproc/split-thai to 0.4
Indexes:
Home | Main Index | Thread Index | Old Index