Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src Add support to msdosfs and makefs to generate correct Unicod...



details:   https://anonhg.NetBSD.org/src/rev/f1a4c8945086
branches:  trunk
changeset: 813485:f1a4c8945086
user:      mlelstv <mlelstv%NetBSD.org@localhost>
date:      Sat Jan 30 09:59:27 2016 +0000

description:
Add support to msdosfs and makefs to generate correct Unicode (UCS-2) directory
entries from UTF-8 encoded file names.

diffstat:

 sbin/mount_msdos/mount_msdos.8         |    17 +-
 sbin/mount_msdos/mount_msdos.c         |     9 +-
 sys/fs/msdosfs/direntry.h              |    11 +-
 sys/fs/msdosfs/msdosfs_conv.c          |  1492 ++++++++++++++++++++++++++++---
 sys/fs/msdosfs/msdosfs_lookup.c        |    14 +-
 sys/fs/msdosfs/msdosfs_vnops.c         |     6 +-
 sys/fs/msdosfs/msdosfsmount.h          |     7 +-
 usr.sbin/makefs/msdos.c                |    43 +-
 usr.sbin/makefs/msdos/msdosfs_vfsops.c |     8 +-
 usr.sbin/makefs/msdos/msdosfs_vnops.c  |    11 +-
 10 files changed, 1402 insertions(+), 216 deletions(-)

diffs (truncated from 2028 to 300 lines):

diff -r 757513cc4d39 -r f1a4c8945086 sbin/mount_msdos/mount_msdos.8
--- a/sbin/mount_msdos/mount_msdos.8    Sat Jan 30 05:15:18 2016 +0000
+++ b/sbin/mount_msdos/mount_msdos.8    Sat Jan 30 09:59:27 2016 +0000
@@ -1,4 +1,4 @@
-.\" $NetBSD: mount_msdos.8,v 1.36 2012/11/16 15:00:18 tsutsui Exp $
+.\" $NetBSD: mount_msdos.8,v 1.37 2016/01/30 09:59:27 mlelstv Exp $
 .\"
 .\" Copyright (c) 1993, 1994 Christopher G. Demetriou
 .\" All rights reserved.
@@ -40,7 +40,7 @@
 .Nd mount an MS-DOS file system
 .Sh SYNOPSIS
 .Nm
-.Op Fl 9Gls
+.Op Fl 9GlsU
 .Op Fl g Ar gid
 .Op Fl M Ar mask
 .Op Fl m Ar mask
@@ -111,6 +111,19 @@
 Otherwise
 .Fl l
 is assumed.
+.It Fl U
+The MS-DOS file system stores filenames in a short
+version using 8-bit characters according to some
+character set and a long version with 16-bit Unicode
+characters.
+The default method to store encoding-agnostic UNIX filenames
+is to copy them byte-wise into both fields. This is
+transparent but generates wrong Unicode characters
+for anything that is not ASCII. Setting the
+.Fl U
+flag interprets UNIX filenames as UTF-8 and generates
+correctly encoded long filenames. This forces
+.Fl l .
 .It Fl M Ar mask
 Specify the maximum file permissions for directories
 in the file system.
diff -r 757513cc4d39 -r f1a4c8945086 sbin/mount_msdos/mount_msdos.c
--- a/sbin/mount_msdos/mount_msdos.c    Sat Jan 30 05:15:18 2016 +0000
+++ b/sbin/mount_msdos/mount_msdos.c    Sat Jan 30 09:59:27 2016 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: mount_msdos.c,v 1.47 2009/10/07 20:34:02 pooka Exp $ */
+/* $NetBSD: mount_msdos.c,v 1.48 2016/01/30 09:59:27 mlelstv Exp $ */
 
 /*
  * Copyright (c) 1994 Christopher G. Demetriou
@@ -36,7 +36,7 @@
 
 #include <sys/cdefs.h>
 #ifndef lint
-__RCSID("$NetBSD: mount_msdos.c,v 1.47 2009/10/07 20:34:02 pooka Exp $");
+__RCSID("$NetBSD: mount_msdos.c,v 1.48 2016/01/30 09:59:27 mlelstv Exp $");
 #endif /* not lint */
 
 #include <sys/param.h>
@@ -94,7 +94,7 @@
        *mntflags = set_gid = set_uid = set_mask = set_dirmask = set_gmtoff = 0;
        (void)memset(args, '\0', sizeof(*args));
 
-       while ((c = getopt(argc, argv, "Gsl9u:g:m:M:o:t:")) != -1) {
+       while ((c = getopt(argc, argv, "Gsl9Uu:g:m:M:o:t:")) != -1) {
                switch (c) {
                case 'G':
                        args->flags |= MSDOSFSMNT_GEMDOSFS;
@@ -108,6 +108,9 @@
                case '9':
                        args->flags |= MSDOSFSMNT_NOWIN95;
                        break;
+               case 'U':
+                       args->flags |= MSDOSFSMNT_UTF8;
+                       break;
                case 'u':
                        args->uid = a_uid(optarg);
                        set_uid = 1;
diff -r 757513cc4d39 -r f1a4c8945086 sys/fs/msdosfs/direntry.h
--- a/sys/fs/msdosfs/direntry.h Sat Jan 30 05:15:18 2016 +0000
+++ b/sys/fs/msdosfs/direntry.h Sat Jan 30 09:59:27 2016 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: direntry.h,v 1.9 2016/01/23 01:26:14 dholland Exp $    */
+/*     $NetBSD: direntry.h,v 1.10 2016/01/30 09:59:27 mlelstv Exp $    */
 
 /*-
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
@@ -136,11 +136,12 @@
 int    unix2dosfn(const unsigned char *un, unsigned char dn[12], int unlen,
            unsigned int gen);
 int    unix2winfn(const unsigned char *un, int unlen, struct winentry *wep,
-           int cnt, int chksum);
+           int cnt, int chksum, int utf8);
 int    winChkName(const unsigned char *un, int unlen, struct winentry *wep,
-           int chksum);
-int    win2unixfn(struct winentry *wep, struct dirent *dp, int chksum);
+           int chksum, int utf8);
+int    win2unixfn(struct winentry *wep, struct dirent *dp, int chksum, 
+           int utf8);
 uint8_t winChksum(uint8_t *name);
-int    winSlotCnt(const unsigned char *un, int unlen);
+int    winSlotCnt(const unsigned char *un, int unlen, int utf8);
 #endif /* _KERNEL || MAKEFS */
 #endif /* _MSDOSFS_DIRENTRY_H_ */
diff -r 757513cc4d39 -r f1a4c8945086 sys/fs/msdosfs/msdosfs_conv.c
--- a/sys/fs/msdosfs/msdosfs_conv.c     Sat Jan 30 05:15:18 2016 +0000
+++ b/sys/fs/msdosfs/msdosfs_conv.c     Sat Jan 30 09:59:27 2016 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: msdosfs_conv.c,v 1.10 2014/09/01 09:09:47 martin Exp $ */
+/*     $NetBSD: msdosfs_conv.c,v 1.11 2016/01/30 09:59:27 mlelstv Exp $        */
 
 /*-
  * Copyright (C) 1995, 1997 Wolfgang Solfrank.
@@ -45,6 +45,16 @@
  * any damages caused by this software.
  *
  * October 1992
+ *
+ * 
+ * Unicode 5.0 case folding taken from
+ *
+ * http://www.unicode.org/Public/5.0.0/ucd/CaseFolding.txt
+ *
+ * Unicode Character Database
+ * Copyright (c) 1991-2006 Unicode, Inc.
+ * For terms of use, see http://www.unicode.org/terms_of_use.html
+ * For documentation, see UCD.html
  */
 
 #if HAVE_NBTOOL_CONFIG_H
@@ -52,13 +62,14 @@
 #endif
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: msdosfs_conv.c,v 1.10 2014/09/01 09:09:47 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: msdosfs_conv.c,v 1.11 2016/01/30 09:59:27 mlelstv Exp $");
 
 /*
  * System include files.
  */
 #include <sys/param.h>
 #include <sys/time.h>
+#include <sys/endian.h>
 #ifdef _KERNEL
 #include <sys/dirent.h>
 #include <sys/systm.h>
@@ -78,6 +89,22 @@
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 
+static int invalidname(const u_int16_t *, int);
+
+static int ucs2utf8(const u_int16_t *, u_int8_t *, int);
+static int utf8ucs2(const u_int8_t *, int, u_int16_t *);
+
+static int ucs2utf8str(const u_int16_t *, int, u_int8_t *, int);
+static int utf8ucs2str(const u_int8_t *, int, u_int16_t *, int);
+static int ucs2char8str(const u_int16_t *, int, u_int8_t *, int);
+static int char8ucs2str(const u_int8_t *, int, u_int16_t *, int);
+
+static void ucs2pad(u_int16_t *, int, int);
+
+static u_int16_t ucs2fold(u_int16_t);
+static int ucs2match(u_int16_t *, u_int16_t *, int n);
+static int char8match(u_int16_t *, u_int16_t *, int n);
+
 /*
  * The number of seconds between Jan 1, 1970 and Jan 1, 1980. In that
  * interval there were 8 regular years and 2 leap years.
@@ -284,6 +311,905 @@
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* f8-ff */
 };
 
+/* Unicode case folding for codes 0x0000..0xffff */
+static const u_int16_t
+foldmap[] = {
+       0x0041, 0x0061, /* LATIN CAPITAL LETTER A */
+       0x0042, 0x0062, /* LATIN CAPITAL LETTER B */
+       0x0043, 0x0063, /* LATIN CAPITAL LETTER C */
+       0x0044, 0x0064, /* LATIN CAPITAL LETTER D */
+       0x0045, 0x0065, /* LATIN CAPITAL LETTER E */
+       0x0046, 0x0066, /* LATIN CAPITAL LETTER F */
+       0x0047, 0x0067, /* LATIN CAPITAL LETTER G */
+       0x0048, 0x0068, /* LATIN CAPITAL LETTER H */
+       0x0049, 0x0069, /* LATIN CAPITAL LETTER I */
+       0x004A, 0x006A, /* LATIN CAPITAL LETTER J */
+       0x004B, 0x006B, /* LATIN CAPITAL LETTER K */
+       0x004C, 0x006C, /* LATIN CAPITAL LETTER L */
+       0x004D, 0x006D, /* LATIN CAPITAL LETTER M */
+       0x004E, 0x006E, /* LATIN CAPITAL LETTER N */
+       0x004F, 0x006F, /* LATIN CAPITAL LETTER O */
+       0x0050, 0x0070, /* LATIN CAPITAL LETTER P */
+       0x0051, 0x0071, /* LATIN CAPITAL LETTER Q */
+       0x0052, 0x0072, /* LATIN CAPITAL LETTER R */
+       0x0053, 0x0073, /* LATIN CAPITAL LETTER S */
+       0x0054, 0x0074, /* LATIN CAPITAL LETTER T */
+       0x0055, 0x0075, /* LATIN CAPITAL LETTER U */
+       0x0056, 0x0076, /* LATIN CAPITAL LETTER V */
+       0x0057, 0x0077, /* LATIN CAPITAL LETTER W */
+       0x0058, 0x0078, /* LATIN CAPITAL LETTER X */
+       0x0059, 0x0079, /* LATIN CAPITAL LETTER Y */
+       0x005A, 0x007A, /* LATIN CAPITAL LETTER Z */
+       0x00B5, 0x03BC, /* MICRO SIGN */
+       0x00C0, 0x00E0, /* LATIN CAPITAL LETTER A WITH GRAVE */
+       0x00C1, 0x00E1, /* LATIN CAPITAL LETTER A WITH ACUTE */
+       0x00C2, 0x00E2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
+       0x00C3, 0x00E3, /* LATIN CAPITAL LETTER A WITH TILDE */
+       0x00C4, 0x00E4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
+       0x00C5, 0x00E5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
+       0x00C6, 0x00E6, /* LATIN CAPITAL LETTER AE */
+       0x00C7, 0x00E7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
+       0x00C8, 0x00E8, /* LATIN CAPITAL LETTER E WITH GRAVE */
+       0x00C9, 0x00E9, /* LATIN CAPITAL LETTER E WITH ACUTE */
+       0x00CA, 0x00EA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
+       0x00CB, 0x00EB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
+       0x00CC, 0x00EC, /* LATIN CAPITAL LETTER I WITH GRAVE */
+       0x00CD, 0x00ED, /* LATIN CAPITAL LETTER I WITH ACUTE */
+       0x00CE, 0x00EE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
+       0x00CF, 0x00EF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
+       0x00D0, 0x00F0, /* LATIN CAPITAL LETTER ETH */
+       0x00D1, 0x00F1, /* LATIN CAPITAL LETTER N WITH TILDE */
+       0x00D2, 0x00F2, /* LATIN CAPITAL LETTER O WITH GRAVE */
+       0x00D3, 0x00F3, /* LATIN CAPITAL LETTER O WITH ACUTE */
+       0x00D4, 0x00F4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
+       0x00D5, 0x00F5, /* LATIN CAPITAL LETTER O WITH TILDE */
+       0x00D6, 0x00F6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
+       0x00D8, 0x00F8, /* LATIN CAPITAL LETTER O WITH STROKE */
+       0x00D9, 0x00F9, /* LATIN CAPITAL LETTER U WITH GRAVE */
+       0x00DA, 0x00FA, /* LATIN CAPITAL LETTER U WITH ACUTE */
+       0x00DB, 0x00FB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
+       0x00DC, 0x00FC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
+       0x00DD, 0x00FD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
+       0x00DE, 0x00FE, /* LATIN CAPITAL LETTER THORN */
+       0x0100, 0x0101, /* LATIN CAPITAL LETTER A WITH MACRON */
+       0x0102, 0x0103, /* LATIN CAPITAL LETTER A WITH BREVE */
+       0x0104, 0x0105, /* LATIN CAPITAL LETTER A WITH OGONEK */
+       0x0106, 0x0107, /* LATIN CAPITAL LETTER C WITH ACUTE */
+       0x0108, 0x0109, /* LATIN CAPITAL LETTER C WITH CIRCUMFLEX */
+       0x010A, 0x010B, /* LATIN CAPITAL LETTER C WITH DOT ABOVE */
+       0x010C, 0x010D, /* LATIN CAPITAL LETTER C WITH CARON */
+       0x010E, 0x010F, /* LATIN CAPITAL LETTER D WITH CARON */
+       0x0110, 0x0111, /* LATIN CAPITAL LETTER D WITH STROKE */
+       0x0112, 0x0113, /* LATIN CAPITAL LETTER E WITH MACRON */
+       0x0114, 0x0115, /* LATIN CAPITAL LETTER E WITH BREVE */
+       0x0116, 0x0117, /* LATIN CAPITAL LETTER E WITH DOT ABOVE */
+       0x0118, 0x0119, /* LATIN CAPITAL LETTER E WITH OGONEK */
+       0x011A, 0x011B, /* LATIN CAPITAL LETTER E WITH CARON */
+       0x011C, 0x011D, /* LATIN CAPITAL LETTER G WITH CIRCUMFLEX */
+       0x011E, 0x011F, /* LATIN CAPITAL LETTER G WITH BREVE */
+       0x0120, 0x0121, /* LATIN CAPITAL LETTER G WITH DOT ABOVE */
+       0x0122, 0x0123, /* LATIN CAPITAL LETTER G WITH CEDILLA */
+       0x0124, 0x0125, /* LATIN CAPITAL LETTER H WITH CIRCUMFLEX */
+       0x0126, 0x0127, /* LATIN CAPITAL LETTER H WITH STROKE */
+       0x0128, 0x0129, /* LATIN CAPITAL LETTER I WITH TILDE */
+       0x012A, 0x012B, /* LATIN CAPITAL LETTER I WITH MACRON */
+       0x012C, 0x012D, /* LATIN CAPITAL LETTER I WITH BREVE */
+       0x012E, 0x012F, /* LATIN CAPITAL LETTER I WITH OGONEK */
+       0x0132, 0x0133, /* LATIN CAPITAL LIGATURE IJ */
+       0x0134, 0x0135, /* LATIN CAPITAL LETTER J WITH CIRCUMFLEX */
+       0x0136, 0x0137, /* LATIN CAPITAL LETTER K WITH CEDILLA */
+       0x0139, 0x013A, /* LATIN CAPITAL LETTER L WITH ACUTE */
+       0x013B, 0x013C, /* LATIN CAPITAL LETTER L WITH CEDILLA */
+       0x013D, 0x013E, /* LATIN CAPITAL LETTER L WITH CARON */
+       0x013F, 0x0140, /* LATIN CAPITAL LETTER L WITH MIDDLE DOT */
+       0x0141, 0x0142, /* LATIN CAPITAL LETTER L WITH STROKE */
+       0x0143, 0x0144, /* LATIN CAPITAL LETTER N WITH ACUTE */
+       0x0145, 0x0146, /* LATIN CAPITAL LETTER N WITH CEDILLA */
+       0x0147, 0x0148, /* LATIN CAPITAL LETTER N WITH CARON */
+       0x014A, 0x014B, /* LATIN CAPITAL LETTER ENG */
+       0x014C, 0x014D, /* LATIN CAPITAL LETTER O WITH MACRON */
+       0x014E, 0x014F, /* LATIN CAPITAL LETTER O WITH BREVE */
+       0x0150, 0x0151, /* LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */
+       0x0152, 0x0153, /* LATIN CAPITAL LIGATURE OE */
+       0x0154, 0x0155, /* LATIN CAPITAL LETTER R WITH ACUTE */
+       0x0156, 0x0157, /* LATIN CAPITAL LETTER R WITH CEDILLA */
+       0x0158, 0x0159, /* LATIN CAPITAL LETTER R WITH CARON */
+       0x015A, 0x015B, /* LATIN CAPITAL LETTER S WITH ACUTE */
+       0x015C, 0x015D, /* LATIN CAPITAL LETTER S WITH CIRCUMFLEX */
+       0x015E, 0x015F, /* LATIN CAPITAL LETTER S WITH CEDILLA */
+       0x0160, 0x0161, /* LATIN CAPITAL LETTER S WITH CARON */
+       0x0162, 0x0163, /* LATIN CAPITAL LETTER T WITH CEDILLA */
+       0x0164, 0x0165, /* LATIN CAPITAL LETTER T WITH CARON */
+       0x0166, 0x0167, /* LATIN CAPITAL LETTER T WITH STROKE */
+       0x0168, 0x0169, /* LATIN CAPITAL LETTER U WITH TILDE */
+       0x016A, 0x016B, /* LATIN CAPITAL LETTER U WITH MACRON */
+       0x016C, 0x016D, /* LATIN CAPITAL LETTER U WITH BREVE */
+       0x016E, 0x016F, /* LATIN CAPITAL LETTER U WITH RING ABOVE */
+       0x0170, 0x0171, /* LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */
+       0x0172, 0x0173, /* LATIN CAPITAL LETTER U WITH OGONEK */
+       0x0174, 0x0175, /* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */
+       0x0176, 0x0177, /* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */
+       0x0178, 0x00FF, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
+       0x0179, 0x017A, /* LATIN CAPITAL LETTER Z WITH ACUTE */
+       0x017B, 0x017C, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
+       0x017D, 0x017E, /* LATIN CAPITAL LETTER Z WITH CARON */
+       0x017F, 0x0073, /* LATIN SMALL LETTER LONG S */
+       0x0181, 0x0253, /* LATIN CAPITAL LETTER B WITH HOOK */
+       0x0182, 0x0183, /* LATIN CAPITAL LETTER B WITH TOPBAR */
+       0x0184, 0x0185, /* LATIN CAPITAL LETTER TONE SIX */
+       0x0186, 0x0254, /* LATIN CAPITAL LETTER OPEN O */
+       0x0187, 0x0188, /* LATIN CAPITAL LETTER C WITH HOOK */
+       0x0189, 0x0256, /* LATIN CAPITAL LETTER AFRICAN D */
+       0x018A, 0x0257, /* LATIN CAPITAL LETTER D WITH HOOK */



Home | Main Index | Thread Index | Old Index