Subject: tail(1) fix for large files
To: None <tech-userlevel@netbsd.org>
From: Michael Graff <explorer@flame.org>
List: tech-userlevel
Date: 11/12/2001 17:11:39
Here's a patch I made to the -current version of tail(1). It works
around a long-standing and damned annoying problem: tail doesn't
work on files larger than 2 GB, and sometimes on smaller ones.

The old "tail -f" and "tail -1234" code would mmap() the whole file
in one call and search backwards from the end.  The problem is that
if the file is too big, the mmap() fails.

The change I made is to walk backwards in 10 MB chunks, unmapping
and remapping as needed, until either the start of the file is
reached or the requested number of lines has been found.

I should probably walk the file again in the forward direction,
using an mmap(), print, munmap() loop, but I don't.  I leave that
to the caller, which uses a rather bad loop of fgetc() and fputc()
to print out the remainder of the file.  In practice, it probably
doesn't matter, since 10 MB is large, and the file _is_ output,
just less efficiently than it could be.

If I can get a few others to sanity-check these diffs, I'll commit
them ASAP.  I wrote them on little sleep, so I could have missed some
sort of edge condition (off-by-one will get me every damned time).

--Michael

Index: forward.c
===================================================================
RCS file: /cvsroot/basesrc/usr.bin/tail/forward.c,v
retrieving revision 1.16
diff -u -r1.16 forward.c
--- forward.c	1999/07/21 06:38:49	1.16
+++ forward.c	2001/11/13 01:02:07
@@ -249,40 +249,79 @@
 	long off;
 	struct stat *sbp;
 {
-	off_t size;
+	off_t file_size;
+	off_t file_remaining;
 	char *p;
 	char *start;
+	off_t mmap_size;
+	off_t mmap_offset;
+	off_t mmap_remaining;
 
-	if (!(size = sbp->st_size))
+#define MMAP_MAXSIZE  (10 * 1024 * 1024)
+
+	if (!(file_size = sbp->st_size))
 		return (0);
+	file_remaining = file_size;
 
-	if (size > SIZE_T_MAX) {
-		err(0, "%s: %s", fname, strerror(EFBIG));
-		return (1);
+	if (mmap_size > MMAP_MAXSIZE) {
+		mmap_size = MMAP_MAXSIZE;
+		mmap_offset = file_size - MMAP_MAXSIZE;
+	} else {
+		mmap_size = file_size;
+		mmap_offset = 0;
 	}
 
-	if ((start = mmap(NULL, (size_t)size, PROT_READ,
-	    MAP_FILE|MAP_SHARED, fileno(fp), (off_t)0)) == (caddr_t)-1) {
-		err(0, "%s: %s", fname, strerror(EFBIG));
-		return (1);
-	}
+	while (off) {
+		start = mmap(NULL, (size_t)mmap_size, PROT_READ,
+			     MAP_FILE|MAP_SHARED, fileno(fp), mmap_offset);
+		if (start == MAP_FAILED) {
+			err(0, "%s: %s", fname, strerror(EFBIG));
+			return (1);
+		}
+
+		mmap_remaining = mmap_size;
+		/* Last char is special, ignore whether newline or not. */
+		for (p = start + mmap_remaining - 1 ; --mmap_remaining ; )
+			if (*--p == '\n' && !--off) {
+				++p;
+				break;
+			}
 
-	/* Last char is special, ignore whether newline or not. */
-	for (p = start + size - 1; --size;)
-		if (*--p == '\n' && !--off) {
-			++p;
+		file_remaining -= mmap_size - mmap_remaining;
+
+		if (off == 0)
 			break;
+
+		if (munmap(start, mmap_size)) {
+			err(0, "%s: %s", fname, strerror(errno));
+			return (1);
 		}
 
-	/* Set the file pointer to reflect the length displayed. */
-	size = sbp->st_size - size;
-	WR(p, size);
-	if (fseek(fp, (long)sbp->st_size, SEEK_SET) == -1) {
-		ierr();
-		return (1);
+		if (mmap_offset >= MMAP_MAXSIZE) {
+			mmap_offset -= MMAP_MAXSIZE;
+		} else {
+			mmap_offset = 0;
+			mmap_size = file_remaining;
+		}
 	}
-	if (munmap(start, (size_t)sbp->st_size)) {
+
+	/*
+	 * Output the (perhaps partial) data in this mmap'd block.
+	 */
+	WR(p, mmap_size - mmap_remaining);
+	file_remaining += mmap_size - mmap_remaining;
+	if (munmap(start, mmap_size)) {
 		err(0, "%s: %s", fname, strerror(errno));
+		return (1);
+	}
+
+	/*
+	 * Set the file pointer to reflect the length displayed.
+	 * This will cause the caller to redisplay the data if/when
+	 * needed.
+	 */
+	if (fseeko(fp, file_remaining, SEEK_SET) == -1) {
+		ierr();
 		return (1);
 	}
 	return (0);