[src/trunk]: src/usr.sbin/makemandb Add a custom tokenizer which does not ste...

To: source-changes-hg%NetBSD.org@localhost
Subject: [src/trunk]: src/usr.sbin/makemandb Add a custom tokenizer which does not ste...
From: abhinav <abhinav%NetBSD.org@localhost>
Date: Tue, 07 Apr 2020 08:20:45 +0000
details:   https://anonhg.NetBSD.org/src/rev/a9afb766883c
branches:  trunk
changeset: 824792:a9afb766883c
user:      abhinav <abhinav%NetBSD.org@localhost>
date:      Sun Jun 18 16:24:10 2017 +0000

description:
Add a custom tokenizer which does not stem certain keywords.

Which keywords should not be stemmed is specified in the nostem.txt file.
(Right now I have taken all the man page names, split them if they had
underscores, removed common English words and converted everything to
lowercase.)

The tokenizer itself is based on the Porter stemming tokenizer shipped with
Sqlite. The code in custom_apropos_tokenizer.c is a copy of that code with
some modifications to prevent stemming keywords specified in nostem.txt.

Additionally, it now also uses underscore `_' as a token delimiter. Therefore,
it is now possible to query for `lwp' and all `_lwp_*' man page names
will be matched. Or the query can be `unconst' and `__UNCONST' will be matched.
This was not possible earlier, because underscore was not a delimiter and therefore
the index would have __UNCONST as a key rather than UNCONST.

The tokenizer needs fts3_tokenizer.h file, which is not shipped with the
amalgamation build of Sqlite, therefore it needs to be added here (unless
we decide there is a better place for it).

To enforce use of the new tokenizer, a schema version bump is needed.

Since the tokenization is done both at indexing time (via makemandb) and
at query time (via apropos or whatis), the schema version will need to be
bumped every time nostem.txt is modified. Otherwise the
index will consist of old tokens and the desired changes will not be seen with
apropos.

This should also fix the issue reported in PR bin/46255. A similar suggestion was
also made on tech-userlevel@ recently:
<http://mail-index.netbsd.org/tech-userlevel/2017/06/08/msg010620.html>

Thanks to christos@ for multiple rounds of reviews of the tokenizer code.

diffstat:

 usr.sbin/makemandb/Makefile                   |    19 +-
 usr.sbin/makemandb/apropos-utils.c            |    39 +-
 usr.sbin/makemandb/apropos-utils.h            |     4 +-
 usr.sbin/makemandb/custom_apropos_tokenizer.c |   754 +++
 usr.sbin/makemandb/custom_apropos_tokenizer.h |    38 +
 usr.sbin/makemandb/fts3_tokenizer.h           |   161 +
 usr.sbin/makemandb/nostem.txt                 |  4840 +++++++++++++++++++++++++
 7 files changed, 5843 insertions(+), 12 deletions(-)

diffs (truncated from 5955 to 300 lines):

diff -r 9030939f1727 -r a9afb766883c usr.sbin/makemandb/Makefile
--- a/usr.sbin/makemandb/Makefile       Sun Jun 18 15:57:16 2017 +0000
+++ b/usr.sbin/makemandb/Makefile       Sun Jun 18 16:24:10 2017 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.8 2017/05/21 15:28:43 riastradh Exp $
+# $NetBSD: Makefile,v 1.9 2017/06/18 16:24:10 abhinav Exp $
 
 .include <bsd.own.mk>
 
@@ -6,9 +6,9 @@
 MANCONFDIR=${NETBSDSRCDIR}/usr.bin/man
 
 PROGS=                 makemandb apropos whatis
-SRCS.makemandb=                makemandb.c apropos-utils.c manconf.c
-SRCS.apropos=  apropos.c apropos-utils.c manconf.c
-SRCS.whatis=   whatis.c apropos-utils.c manconf.c
+SRCS.makemandb=                makemandb.c apropos-utils.c manconf.c custom_apropos_tokenizer.c
+SRCS.apropos=  apropos.c apropos-utils.c manconf.c custom_apropos_tokenizer.c
+SRCS.whatis=   whatis.c apropos-utils.c manconf.c custom_apropos_tokenizer.c
 MAN.makemandb= makemandb.8
 MAN.apropos=   apropos.1
 MAN.whatis=    whatis.1
@@ -39,7 +39,14 @@
        echo '};'                                                       \
        ) > ${.TARGET}
 
-DPSRCS+=       stopwords.c
-CLEANFILES+=   stopwords.c
+nostem.c: nostem.txt
+       ( set -e; ${TOOL_NBPERF} -n nostem_hash -s -p ${.ALLSRC};       \
+       echo 'static const char *nostem[] = {';                 \
+       ${TOOL_SED} -e 's|^\(.*\)$$|    "\1",|' ${.ALLSRC};             \
+       echo '};'                                                       \
+       ) > ${.TARGET}
+
+DPSRCS+=       stopwords.c nostem.c
+CLEANFILES+=   stopwords.c nostem.c
 
 .include <bsd.prog.mk>
diff -r 9030939f1727 -r a9afb766883c usr.sbin/makemandb/apropos-utils.c
--- a/usr.sbin/makemandb/apropos-utils.c        Sun Jun 18 15:57:16 2017 +0000
+++ b/usr.sbin/makemandb/apropos-utils.c        Sun Jun 18 16:24:10 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: apropos-utils.c,v 1.37 2017/05/01 05:28:00 abhinav Exp $       */
+/*     $NetBSD: apropos-utils.c,v 1.38 2017/06/18 16:24:10 abhinav Exp $       */
 /*-
  * Copyright (c) 2011 Abhinav Upadhyay <er.abhinav.upadhyay%gmail.com@localhost>
  * All rights reserved.
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: apropos-utils.c,v 1.37 2017/05/01 05:28:00 abhinav Exp $");
+__RCSID("$NetBSD: apropos-utils.c,v 1.38 2017/06/18 16:24:10 abhinav Exp $");
 
 #include <sys/queue.h>
 #include <sys/stat.h>
@@ -50,7 +50,9 @@
 #undef tab     // XXX: manconf.h
 
 #include "apropos-utils.h"
+#include "custom_apropos_tokenizer.h"
 #include "manconf.h"
+#include "fts3_tokenizer.h"
 
 typedef struct orig_callback_data {
        void *data;
@@ -79,6 +81,28 @@
        1.00    //machine
 };
 
+static int
+register_tokenizer(sqlite3 *db)
+{
+       int rc;
+       sqlite3_stmt *stmt;
+       const sqlite3_tokenizer_module *p;
+       const char *name = "custom_apropos_tokenizer";
+       get_custom_apropos_tokenizer(&p);
+       const char *sql = "SELECT fts3_tokenizer(?, ?)";
+
+       sqlite3_db_config(db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
+       rc = sqlite3_prepare_v2(db, sql, -1, &stmt, 0);
+       if (rc != SQLITE_OK)
+               return rc;
+
+       sqlite3_bind_text(stmt, 1, name, -1, SQLITE_STATIC);
+       sqlite3_bind_blob(stmt, 2, &p, sizeof(p), SQLITE_STATIC);
+       sqlite3_step(stmt);
+
+       return sqlite3_finalize(stmt);
+}
+
 /*
  * lower --
  *  Converts the string str to lower case
@@ -180,7 +204,7 @@
 #ifndef DEBUG          
                "compress=zip, uncompress=unzip, "
 #endif         
-               "tokenize=porter, notindexed=section, notindexed=md5_hash); "
+               "tokenize=custom_apropos_tokenizer, notindexed=section, notindexed=md5_hash); "
            //mandb_meta
            "CREATE TABLE IF NOT EXISTS mandb_meta(device, inode, mtime, "
                "file UNIQUE, md5_hash UNIQUE, id  INTEGER PRIMARY KEY); "
@@ -365,6 +389,14 @@
                goto error;
        }
 
+       sqlite3_extended_result_codes(db, 1);
+
+       rc = register_tokenizer(db);
+       if (rc != SQLITE_OK) {
+               warnx("Unable to register custom tokenizer: %s", sqlite3_errmsg(db));
+               goto error;
+       }
+
        if (create_db_flag && create_db(db) < 0) {
                warnx("%s", "Unable to create database schema");
                goto error;
@@ -390,7 +422,6 @@
        }
        sqlite3_finalize(stmt);
 
-       sqlite3_extended_result_codes(db, 1);
 
        /* Register the zip and unzip functions for FTS compression */
        rc = sqlite3_create_function(db, "zip", 1, SQLITE_ANY, NULL, zip,
diff -r 9030939f1727 -r a9afb766883c usr.sbin/makemandb/apropos-utils.h
--- a/usr.sbin/makemandb/apropos-utils.h        Sun Jun 18 15:57:16 2017 +0000
+++ b/usr.sbin/makemandb/apropos-utils.h        Sun Jun 18 16:24:10 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: apropos-utils.h,v 1.12 2017/05/01 05:28:00 abhinav Exp $       */
+/*     $NetBSD: apropos-utils.h,v 1.13 2017/06/18 16:24:10 abhinav Exp $       */
 /*-
  * Copyright (c) 2011 Abhinav Upadhyay <er.abhinav.upadhyay%gmail.com@localhost>
  * All rights reserved.
@@ -45,7 +45,7 @@
 } mandb_access_mode;
 
 
-#define APROPOS_SCHEMA_VERSION 20120507
+#define APROPOS_SCHEMA_VERSION 20170618
 
 /*
  * Used to identify the section of a man(7) page.
diff -r 9030939f1727 -r a9afb766883c usr.sbin/makemandb/custom_apropos_tokenizer.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/usr.sbin/makemandb/custom_apropos_tokenizer.c     Sun Jun 18 16:24:10 2017 +0000
@@ -0,0 +1,754 @@
+/*     $NetBSD: custom_apropos_tokenizer.c,v 1.1 2017/06/18 16:24:10 abhinav Exp $     */
+/*
+** 2006 September 30
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+** Implementation of the full-text-search tokenizer that implements
+** a Porter stemmer.
+*/
+
+/*
+** The code in this file is only compiled if:
+**
+**     * The FTS3 module is being built as an extension
+**       (in which case SQLITE_CORE is not defined), or
+**
+**     * The FTS3 module is being built into the core of
+**       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
+*/
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "custom_apropos_tokenizer.h"
+#include "fts3_tokenizer.h"
+#include "nostem.c"
+
+/*
+ * Class derived from sqlite3_tokenizer
+ */
+typedef struct custom_apropos_tokenizer {
+       sqlite3_tokenizer base; /* Base class */
+} custom_apropos_tokenizer;
+
+/*
+ * Class derived from sqlite3_tokenizer_cursor
+ */
+typedef struct custom_apropos_tokenizer_cursor {
+       sqlite3_tokenizer_cursor base;
+       const char *zInput;     /* input we are tokenizing */
+       size_t nInput;          /* size of the input */
+       size_t iOffset;         /* current position in zInput */
+       size_t iToken;          /* index of next token to be returned */
+       char *zToken;           /* storage for current token */
+       size_t nAllocated;              /* space allocated to zToken buffer */
+} custom_apropos_tokenizer_cursor;
+
+/*
+ * Create a new tokenizer instance.
+ */
+static int
+aproposPorterCreate(int argc, const char *const * argv,
+    sqlite3_tokenizer ** ppTokenizer)
+{
+       custom_apropos_tokenizer *t;
+       t = calloc(1, sizeof(*t));
+       if (t == NULL)
+               return SQLITE_NOMEM;
+       *ppTokenizer = &t->base;
+       return SQLITE_OK;
+}
+
+/*
+ * Destroy a tokenizer
+ */
+static int 
+aproposPorterDestroy(sqlite3_tokenizer * pTokenizer)
+{
+       free(pTokenizer);
+       return SQLITE_OK;
+}
+
+/*
+ * Prepare to begin tokenizing a particular string.  The input
+ * string to be tokenized is zInput[0..nInput-1].  A cursor
+ * used to incrementally tokenize this string is returned in 
+ * *ppCursor.
+ */
+static int 
+aproposPorterOpen(
+    sqlite3_tokenizer * pTokenizer,    /* The tokenizer */
+    const char *zInput, int nInput,    /* String to be tokenized */
+    sqlite3_tokenizer_cursor ** ppCursor       /* OUT: Tokenization cursor */
+)
+{
+       custom_apropos_tokenizer_cursor *c;
+
+       c = calloc(1, sizeof(*c));
+       if (c == NULL)
+               return SQLITE_NOMEM;
+
+       c->zInput = zInput;
+       if (zInput != 0) {
+               if (nInput < 0)
+                       c->nInput = strlen(zInput);
+               else
+                       c->nInput = nInput;
+       }
+
+       *ppCursor = &c->base;
+       return SQLITE_OK;
+}
+
+/*
+ * Close a tokenization cursor previously opened by a call to
+ * aproposPorterOpen() above.
+ */
+static int 
+aproposPorterClose(sqlite3_tokenizer_cursor *pCursor)
+{
+       custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor;
+       free(c->zToken);
+       free(c);
+       return SQLITE_OK;
+}
+
+/*
+ * Vowel or consonant
+ */
+static const char cType[] = {
+       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
+       1, 1, 1, 2, 1
+};
+
+/*
+ * isConsonant() and isVowel() determine if their first character in
+ * the string they point to is a consonant or a vowel, according
+ * to Porter ruls.  
+ *
+ * A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
+ * 'Y' is a consonant unless it follows another consonant,
+ * in which case it is a vowel.
+ *
+ * In these routine, the letters are in reverse order.  So the 'y' rule
+ * is that 'y' is a consonant unless it is followed by another
+ * consonent.
+ */
+static int isVowel(const char*);
+
+static int 
+isConsonant(const char *z)
Prev by Date: [src/trunk]: src/sys/arch/evbarm/conf Build exynos5422 .dtb files with this k...
Next by Date: [src/trunk]: src/etc/pam.d Install racoon pam file.
Previous by Thread: [src/trunk]: src/sys/arch/evbarm/conf Build exynos5422 .dtb files with this k...
Next by Thread: [src/trunk]: src/etc/pam.d Install racoon pam file.
Indexes:
Home | Main Index | Thread Index | Old Index