tech-userlevel archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

change proposal: nvi behavior for multi-width character



Hi,

I'm planning to change the current behavior of nvi for multi-width
characters to match that of nvi-m17n, written by itojun.

Any suggestions or comments are welcome, especially from users
who work in non-C locales :-).

(1) cursor position (nvi-cursor.patch)

This patch fixes the cursor position when a multi-width character
does not fit on a line and is therefore placed on the next line.

Also, when the cursor is on a multi-width character, put it on
the first column of the character, instead of the last column as
in the current implementation. Otherwise, some terminal emulators
do not highlight the entire character, but only its right-most
column.

(2) join command (nvi-join.patch)

This patch changes the amount of white space inserted when lines
ending or beginning with multi-width characters are joined:

  last char       first char      behavior
  ---             ---             ---
  multi-width     multi-width     nothing ins'ed
  multi-width     single-width    1 spc ins'ed
  single-width    multi-width     1 spc ins'ed
  single-width    single-width    original

This is (basically) the same behavior as nvi-m17n. As a Japanese
speaker, I feel this is quite a reasonable choice, and I guess it
may also be reasonable for other non-European languages that leave
no space between words.

(3) word-wise movement (not yet)

At the moment, word-wise movements do not work for languages
without spaces between words. They may never work unless we have
LC_COLLATE support in our libc. (Also, morphological analysis
would be required for a full implementation for languages like
Japanese. However, that is quite a different matter...)

Tentatively, I suggest regarding a change in character width as
a word boundary (width, not character length in bytes; cf. characters
with umlauts in UTF-8). What do you think of this?

rin
Index: dist/vi/vs_line.c
===================================================================
RCS file: /cvsroot/src/external/bsd/nvi/dist/vi/vs_line.c,v
retrieving revision 1.3
diff -u -r1.3 vs_line.c
--- dist/vi/vs_line.c	26 Jan 2014 21:43:45 -0000	1.3
+++ dist/vi/vs_line.c	6 Nov 2017 06:31:30 -0000
@@ -271,38 +271,64 @@
 
 	/* Do it the hard way, for leftright scrolling screens. */
 	if (O_ISSET(sp, O_LEFTRIGHT)) {
-		for (; offset_in_line < len; ++offset_in_line) {
-			chlen = (ch = (UCHAR_T)*p++) == L('\t') && !list_tab ?
+		 while (offset_in_line < len) {
+			ch = (UCHAR_T)*p;
+			chlen = (ch == '\t' && !list_tab) ?
 			    TAB_OFF(scno) : KEY_COL(sp, ch);
-			if ((scno += chlen) >= skip_cols)
-				break;
+
+			/* easy cases first. */
+			if (scno + chlen < skip_cols) {
+				scno += chlen;
+				p++;
+				offset_in_line++;
+				continue;
+			}
+
+			if (scno + chlen == skip_cols) {
+				scno += chlen;
+				p++;
+				offset_in_line++;
+			}
+
+			break;
 		}
 
 		/* Set cols_per_screen to 2nd and later line length. */
 		cols_per_screen = sp->cols;
 
 		/* Put starting info for this line in the cache. */
-		if (offset_in_line >= len) {
-			smp->c_sboff = offset_in_line;
-			smp->c_scoff = 255;
-		} else if (scno != skip_cols) {
-			smp->c_sboff = offset_in_line;
-			smp->c_scoff =
-			    offset_in_char = chlen - (scno - skip_cols);
-			--p;
-		} else {
-			smp->c_sboff = ++offset_in_line;
-			smp->c_scoff = 0;
-		}
+		smp->c_sboff = offset_in_line;
+		smp->c_scoff = offset_in_char = scno + chlen - skip_cols;
 	}
 
 	/* Do it the hard way, for historic line-folding screens. */
 	else {
-		for (; offset_in_line < len; ++offset_in_line) {
-			chlen = (ch = (UCHAR_T)*p++) == L('\t') && !list_tab ?
+		 while (offset_in_line < len) {
+			ch = (UCHAR_T)*p;
+			chlen = (ch == '\t' && !list_tab) ?
 			    TAB_OFF(scno) : KEY_COL(sp, ch);
-			if ((scno += chlen) < cols_per_screen)
+
+			/* Easy case first. */
+			if (scno + chlen < cols_per_screen) {
+				scno += chlen;
+				p++;
+				offset_in_line++;
 				continue;
+			}
+
+			/*
+			 * Since we can't generally cross the rightmost column
+			 * by displaying multi-width char, we must check it.
+			 * In that case, we fake the scno so that you'll see
+			 * that the line was already filled up completely.
+			 */
+			if (!INTISWIDE(ch) || scno + chlen == cols_per_screen) {
+				scno += chlen;
+				p++;
+				offset_in_line++;
+			} else
+				scno = cols_per_screen;
+
 			scno -= cols_per_screen;
 
 			/* Set cols_per_screen to 2nd and later line length. */
@@ -320,9 +346,10 @@
 		if (scno != 0) {
 			smp->c_sboff = offset_in_line;
 			smp->c_scoff = offset_in_char = chlen - scno;
-			--p;
+			offset_in_line--;
+			p--;
 		} else {
-			smp->c_sboff = ++offset_in_line;
+			smp->c_sboff = offset_in_line;
 			smp->c_scoff = 0;
 		}
 	}
@@ -334,10 +361,16 @@
 	 * called repeatedly with a valid pointer to a cursor position.
 	 * Don't fill anything in unless it's the right line and the right
 	 * character, and the right part of the character...
+	 *
+	 * It is not true that every wide chars occupy at least single column.
+	 * - It is safe to compare sp->cno and offset_in_line since they are
+	 *   both offset in unit of CHAR_T.
+	 * - We can't simply compare offset_in_line + cols_per_screen against
+	 *   sp->cno, since cols_per_screen is screen column, not offset in
+	 *   CHAR_T.  Do it slowly.
 	 */
 	if (yp == NULL ||
-	    smp->lno != sp->lno || sp->cno < offset_in_line ||
-	    offset_in_line + cols_per_screen < sp->cno) {
+	    smp->lno != sp->lno || sp->cno < offset_in_line) {
 		cno_cnt = 0;
 		/* If the line is on the screen, quit. */
 		if (is_cached || no_draw)
@@ -358,6 +391,23 @@
 		}
 
 		/*
+		 * Since we can't generally cross the rightmost column
+		 * by displaying multi-width char, we must check it.
+		 * In that case, we fake the scno so that you'll see
+		 * that the line was already filled up completely.
+		 */
+		if (INTISWIDE(ch) && scno > cols_per_screen) {
+			smp->c_ecsize = chlen;
+			smp->c_eclen = 0;
+
+			is_partial = 1;
+
+			smp->c_eboff = offset_in_line;
+
+			/* Terminate the loop. */
+			offset_in_line = len;
+		} else
+		/*
 		 * Only display up to the right-hand column.  Set a flag if
 		 * the entire character wasn't displayed for use in setting
 		 * the cursor.  If reached the end of the line, set the cache
@@ -400,6 +450,8 @@
 					*xp = scno - smp->c_ecsize;
 				else
 					*xp = scno - chlen;
+			else if (INTISWIDE(ch))
+				*xp = scno - chlen;
 			else
 				*xp = scno - 1;
 			if (O_ISSET(sp, O_NUMBER) &&
@@ -437,8 +489,8 @@
 			if (cbp + chlen >= ecbp)
 				FLUSH;
 
-			/* don't display half a wide character */
-			if (is_partial && CHAR_WIDTH(sp, ch) > 1) {
+			/* Don't display half a multi-width character */
+			if (is_partial && INTISWIDE(ch)) {
 				*cbp++ = ' ';
 				break;
 			}
@@ -458,7 +510,7 @@
 
 	if (scno < cols_per_screen) {
 		/* If didn't paint the whole line, update the cache. */
-		smp->c_ecsize = smp->c_eclen = KEY_LEN(sp, ch);
+		smp->c_ecsize = smp->c_eclen = KEY_COL(sp, ch);
 		smp->c_eboff = len - 1;
 
 		/*
Index: dist/vi/vs_refresh.c
===================================================================
RCS file: /cvsroot/src/external/bsd/nvi/dist/vi/vs_refresh.c,v
retrieving revision 1.6
diff -u -r1.6 vs_refresh.c
--- dist/vi/vs_refresh.c	26 Jan 2014 21:43:45 -0000	1.6
+++ dist/vi/vs_refresh.c	6 Nov 2017 06:31:30 -0000
@@ -148,7 +148,7 @@
 	SMAP *smp, tmp;
 	VI_PRIVATE *vip;
 	db_recno_t lastline, lcnt;
-	size_t cwtotal, cnt, len, notused, off, y;
+	size_t cwtotal, cnt, len, notused, off, y, chlen;
 	int ch = 0, didpaint, isempty, leftright_warp;
 	CHAR_T *p;
 
@@ -467,17 +467,33 @@
 		/*
 		 * 7a: Cursor moved left.
 		 *
-		 * Point to the old character.  The old cursor position can
-		 * be past EOL if, for example, we just deleted the rest of
-		 * the line.  In this case, since we don't know the width of
-		 * the characters we traversed, we have to do it slowly.
+		 * The old cursor position can be past EOL if, for example,
+		 * we just deleted the rest of the line.  In this case, since
+		 * we don't know the width of the characters we traversed, we
+		 * have to do it slowly.
 		 */
-		p += OCNO;
-		cnt = (OCNO - CNO) + 1;
 		if (OCNO >= len)
 			goto slow;
 
 		/*
+		 * cwtotal acts as new value for SCNO.  Set cwtotal to the
+		 * first char for content on CNO byte, for ease handling of
+		 * wide characters.
+		 *
+		 * If the character we're stepping on lies across a screen
+		 * boundary, we have no hope to speed it up.  Do it slowly.
+		 */
+		p += OCNO;
+		if (INTISWIDE(ch = (UCHAR_T)*p))
+			cwtotal = SCNO;
+		else {
+			if (ch == '\t' || (chlen = KEY_LEN(sp, ch)) > SCNO + 1)
+				goto slow;
+			cwtotal = SCNO + 1 - chlen;
+		}
+		cnt = OCNO - CNO;
+
+		/*
 		 * Quick sanity check -- it's hard to figure out exactly when
 		 * we cross a screen boundary as we do in the cursor right
 		 * movement.  If cnt is so large that we're going to cross the
@@ -488,63 +504,87 @@
 
 		/*
 		 * Count up the widths of the characters.  If it's a tab
-		 * character, go do it the the slow way.
+		 * character, go do it the slow way.
 		 */
-		for (cwtotal = 0; cnt--; cwtotal += KEY_COL(sp, ch))
-			if ((ch = *(UCHAR_T *)p--) == '\t')
+		while (cnt--) {
+			if ((ch = (UCHAR_T)*--p) == '\t'
+			    || (chlen = KEY_COL(sp, ch)) > cwtotal)
 				goto slow;
+			cwtotal -= chlen;
+		}
 
 		/*
-		 * Decrement the screen cursor by the total width of the
-		 * characters minus 1.
-		 */
-		cwtotal -= 1;
-
-		/*
-		 * If we're moving left, and there's a wide character in the
+		 * If we're moving left, and there's a multi-width char in the
 		 * current position, go to the end of the character.
 		 */
-		if (KEY_COL(sp, ch) > 1)
-			cwtotal -= KEY_COL(sp, ch) - 1;
+		if (!INTISWIDE(ch) && (chlen = KEY_LEN(sp, ch)) > 1)
+			cwtotal += chlen - 1;
 
 		/*
-		 * If the new column moved us off of the current logical line,
-		 * calculate a new one.  If doing leftright scrolling, we've
-		 * moved off of the current screen, as well.
+		 * At last, update the screen cursor.
 		 */
-		if (SCNO < cwtotal)
-			goto slow;
-		SCNO -= cwtotal;
+		SCNO = cwtotal;
 	} else {
 		/*
 		 * 7b: Cursor moved right.
-		 *
-		 * Point to the first character to the right.
 		 */
-		p += OCNO + 1;
+		if (OCNO >= len)
+			goto slow;
+
+		/*
+		 * cwtotal acts as new value for SCNO.  Set cwtotal to the
+		 * first char for content on CNO byte, for ease handling
+		 * of wide characters.
+		 */
+		p += OCNO;
+		if (INTISWIDE(ch = (UCHAR_T)*p))
+			cwtotal = SCNO;
+		else
+			cwtotal = SCNO + 1 - KEY_LEN(sp, ch);
 		cnt = CNO - OCNO;
 
 		/*
 		 * Count up the widths of the characters.  If it's a tab
-		 * character, go do it the the slow way.  If we cross a
-		 * screen boundary, we can quit.
+		 * character, go do it the slow way.
+		 *
+		 * If a multi-width char seems to occupy the screen boundary,
+		 * that will be pushed to the next line.  Adjust the cursor
+		 * in that case.
+		 *
+		 * If we cross a screen boundary, we can quit.
 		 */
-		for (cwtotal = SCNO; cnt--;) {
-			if ((ch = *(UCHAR_T *)p++) == '\t')
+		while (cnt) {
+			if (ch == '\t')
 				goto slow;
-			if ((cwtotal += KEY_COL(sp, ch)) >= SCREEN_COLS(sp))
+			cwtotal += KEY_COL(sp, ch);
+			cnt--;
+			if (INTISWIDE(ch = (UCHAR_T)*++p)
+			    && (chlen = CHAR_WIDTH(sp, ch)) > 1
+			    && cwtotal + chlen >= SCREEN_COLS(sp))
+				cwtotal = SCREEN_COLS(sp);
+			if (cwtotal >= SCREEN_COLS(sp))
 				break;
 		}
 
 		/*
-		 * Increment the screen cursor by the total width of the
-		 * characters.
+		 * If we are on the tab character, we must do it slowly.
+		 *
+		 * If we're on a multi-width character in the current position,
+		 * go to the end of the character.
 		 */
-		SCNO = cwtotal;
+		if (ch == '\t')
+			goto slow;
+		if (!INTISWIDE(ch) && (chlen = KEY_LEN(sp, ch)) > 1)
+			cwtotal += chlen - 1;
 
 		/* See screen change comment in section 6a. */
-		if (SCNO >= SCREEN_COLS(sp))
+		if (cwtotal >= SCREEN_COLS(sp))
 			goto slow;
+
+		/*
+		 * At last, update the screen cursor.
+		 */
+		SCNO = cwtotal;
 	}
 
 	/*
@@ -678,6 +718,8 @@
 	}
 #else
 	if (vip->sc_smap == NULL) {
+		if (F_ISSET(sp, SC_SCR_REFORMAT))
+			abort(); /* XXX */
 		F_SET(sp, SC_SCR_REFORMAT);
 		return (vs_paint(sp, flags));
 	}
Index: dist/vi/vs_relative.c
===================================================================
RCS file: /cvsroot/src/external/bsd/nvi/dist/vi/vs_relative.c,v
retrieving revision 1.3
diff -u -r1.3 vs_relative.c
--- dist/vi/vs_relative.c	26 Jan 2014 21:43:45 -0000	1.3
+++ dist/vi/vs_relative.c	6 Nov 2017 06:31:30 -0000
@@ -162,22 +162,84 @@
 			curoff -= sp->cols;				\
 	}								\
 }
-	if (cnop == NULL)
-		while (len--) {
-			chlen = CHLEN(curoff);
+	if (cnop == NULL) {
+		while (len > 0) {
+			ch = (UCHAR_T)*p;
+
+			/* singlebyte case */
+			if (!INTISWIDE(ch)) {
+				chlen = CHLEN(curoff);
+				last = scno;
+				scno += chlen;
+				len--;
+				/* p will be modified in CHLEN() */
+				TAB_RESET;
+				continue;
+			}
+
+			/* multibyte case */
+			chlen = CHAR_WIDTH(sp, ch);
 			last = scno;
 			scno += chlen;
-			TAB_RESET;
+			len--;
+			p++;
+
+			/*
+			 * If multi-width char crosses the end-of-screen,
+			 * put it on the next line.
+			 */
+			curoff += chlen;
+			if (!leftright && curoff >= sp->cols) {
+				if (curoff == sp->cols)
+					curoff = 0;
+				else {
+					scno -= scno % sp->cols;
+					scno += chlen;
+					curoff = chlen;
+				}
+			}
 		}
-	else
+	} else {
 		for (cno = *cnop;; --cno) {
-			chlen = CHLEN(curoff);
+			ch = (UCHAR_T)*p;
+
+			/* singlebyte case */
+			if (!INTISWIDE(ch)) {
+				chlen = CHLEN(curoff);
+				last = scno;
+				scno += chlen;
+				/* p will be modified in CHLEN() */
+				TAB_RESET;
+				if (cno == 0)
+					break;
+				continue;
+			}
+
+			/* multibyte case */
+			chlen = CHAR_WIDTH(sp, ch);
 			last = scno;
 			scno += chlen;
-			TAB_RESET;
+			p++;
+
+			/*
+			 * If multi-width char crosses the end-of-screen,
+			 * put it on the next line.
+			 */
+			curoff += chlen;
+			if (!leftright && curoff >= sp->cols) {
+				if (curoff == sp->cols)
+					curoff = 0;
+				else {
+					scno -= scno % sp->cols;
+					scno += chlen;
+					curoff = chlen;
+				}
+			}
+
 			if (cno == 0)
 				break;
 		}
+	}
 
 	/* Add the trailing '$' if the O_LIST option set. */
 	if (listset && cnop == NULL)
@@ -249,8 +311,45 @@
 	off = cno / sp->cols;
 	cno %= sp->cols;
 	for (scno = 0, p = lp, len = llen; off--;) {
-		for (; len && scno < sp->cols; --len)
-			scno += CHLEN(scno);
+		while (len && scno < sp->cols) {
+			ch = (UCHAR_T)*p;
+			if (ch == '\t' && !listset) {
+				scno += TAB_OFF(scno);
+				len--;
+				p++;
+				continue;
+			}
+
+			chlen = KEY_COL(sp, ch);
+			if (!INTISWIDE(ch) || scno + chlen < sp->cols) {
+				/*
+				 * Singlebyte char can be displayed across
+				 * the end-of-screen.
+				 * If a multi-width char fits into this line,
+				 * put it here.
+				 */
+				scno += chlen;
+				len--;
+				p++;
+			} else if (leftright) {
+				/*
+				 * Side-scrolling screen is similar to
+				 * singlebyte case.
+				 */
+				scno += chlen;
+				len--;
+				p++;
+			} else {
+				/*
+				 * If multi-width char crosses the
+				 * end-of-screen, put it on the next line.
+				 *
+				 * We must adjust ch to the last char of the
+				 * line.
+				 */
+				scno = sp->cols;
+			}
+		}
 
 		/*
 		 * If reached the end of the physical line, return the last
Index: dist/ex/ex_join.c
===================================================================
RCS file: /cvsroot/src/external/bsd/nvi/dist/ex/ex_join.c,v
retrieving revision 1.3
diff -u -r1.3 ex_join.c
--- dist/ex/ex_join.c	26 Jan 2014 21:43:45 -0000	1.3
+++ dist/ex/ex_join.c	6 Nov 2017 06:31:30 -0000
@@ -109,7 +109,30 @@
 		 */
 		extra = 0;
 		if (!first && !FL_ISSET(cmdp->iflags, E_C_FORCE)) {
-			if (ISBLANK(echar))
+			/*
+			 * Here we implement behavior just based on nvi-m17n.
+			 *	last char	first char	behavior
+			 *	---		---		---
+			 *	multi-width	multi-width	nothing ins'ed
+			 *	multi-width	single-width	1 spc ins'ed
+			 *	single-width	multi-width	1 spc ins'ed
+			 *	single-width	single-width	original
+			 */
+			if (INTISWIDE(echar) && CHAR_WIDTH(sp, echar) > 1) {
+				if (INTISWIDE(p[0])
+				    && CHAR_WIDTH(sp, p[0]) > 1) {
+					; /* nothing */
+				} else {
+					*tbp++ = ' ';
+					++clen;
+					for (; len && ISBLANK((UCHAR_T)*p);
+					    --len, ++p);
+				}
+			} else if (INTISWIDE(p[0])
+				   && CHAR_WIDTH(sp, p[0]) > 1) {
+				*tbp++ = ' ';
+				++clen;
+			} else if (ISBLANK(echar))
 				for (; len && ISBLANK((UCHAR_T)*p); --len, ++p);
 			else if (p[0] != ')') {
 				if (STRCHR(L(".?!"), echar)) {


Home | Main Index | Thread Index | Old Index