Subject: kern/25010: com(4) can get stuck while sending data
To: None <gnats-bugs@gnats.NetBSD.org>
From: Matthias Scheler <tron@colwyn.zhadum.de>
List: netbsd-bugs
Date: 04/01/2004 16:51:19
>Number:         25010
>Category:       kern
>Synopsis:       com(4) can get stuck while sending data
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    kern-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Thu Apr 01 14:52:00 UTC 2004
>Closed-Date:
>Last-Modified:
>Originator:     
>Release:        NetBSD 2.0B / 1.6.2_STABLE
>Organization:
Matthias Scheler                                  http://scheler.de/~matthias/
>Environment:
System: NetBSD lyssa.zhadum.de 2.0B NetBSD 2.0B (LYSSA) #0: Wed Mar 31 13:13:28 CEST 2004 tron@lyssa.zhadum.de:/src/sys/compile/LYSSA i386

System: NetBSD colwyn.zhadum.de 1.6.2_STABLE NetBSD 1.6.2_STABLE (COLWYN) #0: Sat Mar 20 14:55:07 CET 2004     tron@colwyn.zhadum.de:/src/sys/compile/COLWYN i386

Architecture: i386
Machine: i386

>Description:
While investigating PR kern/18799 FTP connections on my SLIP link often
hung exactly like Tomasz Luchowski reported in the above problem report.
Surprisingly it also happened with my tun(4) based userland SLIP
implementation which I wrote and successfully tested years ago. After
extensive debugging I began to suspect a bug in the com(4) driver used
for the serial ports of both machines. So I wrote a small test program
which works like this:

1.) In sender mode it keeps sending as much data as it can in random
    block sizes. All incoming data will be read in small blocks.

2.) In receive mode (option "-r") it will read all incoming data in
    small blocks and occasionally send out small data blocks after
    it received some data.

Here are the results:

1.6.2_STABLE com(4) sending to 2.0B com(4):	hangs after a short time
1.6.2_STABLE com(4) sending to 2.0B ucom(4):	hangs after a short time
2.0B com(4) sending to 1.6.2_STABLE com(4):	hangs after a short time
2.0B ucom(4) sending to 1.6.2_STABLE com(4):	works several minutes until
						test stopped

So it looks like the com(4) driver can get stuck on sending after a while.
I've tried with and without hardware handshake but it didn't make a
difference.

>How-To-Repeat:
Compile the following program ...

#include <sys/ioctl.h>
#include <sys/param.h>
#include <sys/termios.h>
#include <sys/time.h>
#include <fcntl.h>
#include <paths.h>
#include <unistd.h>

#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * Print the command line synopsis for this program to stderr and
 * terminate the process with EXIT_FAILURE. Never returns.
 *
 * Progname: name to show as the program name in the synopsis.
 */
static void
Usage(char *Progname)
{
	fprintf(stderr,
	    "Usage: %s [-hr] [-s baudrate] [-t timeout] "
	    "ttyname\n",
	    Progname);

	exit(EXIT_FAILURE);
}

/*
 * Serial line stress test.
 *
 * Opens the tty named on the command line (looked up relative to
 * _PATH_DEV) in non-blocking mode, puts it into an 8 bit mode with all
 * input, output and local processing flags cleared, and then loops on
 * poll(2):
 *
 *  - Sender mode (default): always waits for both input and output;
 *    writes blocks of 1..sizeof(OutBuffer) bytes of random data taken
 *    from a random offset inside OutBuffer, and reads whatever arrives
 *    in blocks of 1..sizeof(InBuffer) bytes.
 *
 *  - Receiver mode (-r): always waits for input; output is requested
 *    only once some data has been received, and then only on roughly
 *    one out of seven iterations, with blocks of 1..64 bytes.
 *
 * Options:
 *   -h           enable CRTSCTS hardware handshake
 *   -r           receiver mode
 *   -s baudrate  line speed (default 9600)
 *   -t timeout   poll timeout in seconds (default 10)
 *
 * A poll timeout only prints a statistics line; the loop keeps running.
 * The loop ends (and the program returns EXIT_FAILURE) when poll(2),
 * read(2) or write(2) fails.
 */
int
main(int argc, char **argv)
{
	char		*progname;
	int		CFlag, Receiver, Speed, Timeout, Opt, TTY;
	int		Sent, Received;
	size_t		Index;
	struct termios	TIO;
	struct pollfd	pfd;
	char		OutBuffer[65536], InBuffer[256];

	progname = argv[0];
	CFlag = HUPCL|CLOCAL;
	Receiver = 0;
	Speed = 9600;
	Timeout = 10;
	while ((Opt = getopt(argc, argv, "hrs:t:")) != -1) {
		switch (Opt) {
		case 'h':
			CFlag |= CRTSCTS;
			break;
		case 'r':
			Receiver = 1;
			break;
		case 's':
			Speed = atoi(optarg);
			break;
		case 't':
			Timeout = atoi(optarg);
			break;
		default:
			Usage(progname);
		}
	}

	/* From here on argv[0] is the tty name, not the program name. */
	argc -= optind;
	argv += optind;
	if (argc != 1)
		Usage(progname);

	if (chdir(_PATH_DEV)) {
		perror(_PATH_DEV);
		return EXIT_FAILURE;
	}

	if ((TTY = open(argv[0], O_RDWR|O_NONBLOCK))<0) {
		perror(argv[0]);
		return EXIT_FAILURE;
	}

	/*
	 * Start from an all-zero termios so that no indeterminate stack
	 * contents (remaining c_cc[] slots etc.) are handed to the kernel.
	 */
	(void)memset(&TIO, 0, sizeof(TIO));
	TIO.c_cflag = CREAD|CS8|CFlag;
	TIO.c_iflag = 0;
	TIO.c_lflag = 0;
	TIO.c_oflag = 0;
	TIO.c_cc[VMIN] = 1;
	TIO.c_cc[VTIME] = 0;
	(void) cfsetspeed(&TIO, Speed);
	if (tcsetattr(TTY, TCSADRAIN, &TIO) < 0) {
		/* argv[1] is the NULL terminator here; report the tty name. */
		perror(argv[0]);
		return EXIT_FAILURE;
	}

	/* Fill the output buffer with pseudo random payload. */
	srandom(time(NULL));
	for (Index = 0; Index < sizeof (OutBuffer); Index++)
		OutBuffer[Index] = (char)random();

	pfd.fd = TTY;
	pfd.events = POLLIN|POLLOUT;

	Sent = 0;
	Received = 0;

	for (;;) {
		int 	Num, Size;

		if (Receiver) {
			/*
			 * Only answer occasionally, and never before
			 * something has been received.
			 */
			if (Received > 0 && (random() % 7) == 4)
				pfd.events |= POLLOUT;
			else
				pfd.events &= ~POLLOUT;
		}

		Num = poll(&pfd, 1, Timeout * 1000);
		if (Num < 0) {
			perror("poll");
			break;
		}
		if (Num == 0) {
			/* Timeout: this is the symptom of a stuck line. */
			(void)printf("No traffic for %d seconds "
			    "after %d bytes received and %d bytes sent\n",
			    Timeout, Received, Sent);
			continue;
		}

		if (pfd.revents & POLLIN) {
			/* Read a random sized block of 1..sizeof(InBuffer). */
			Size = (random() % sizeof (InBuffer)) + 1;
			Num = read(TTY, InBuffer, Size);
			if (Num < 0) {
				perror("read");
				break;
			}
			Received += Num;
		}

		if (pfd.revents & POLLOUT) {
			int	Max, Offset;

			/*
			 * Write a random sized block starting at a random
			 * offset chosen so that it stays inside OutBuffer.
			 */
			Max = Receiver ? 64 : sizeof (OutBuffer);
			Size = (random() % Max) + 1;
			Offset = (Size == sizeof (OutBuffer)) ?
			    0 : (random() % (sizeof (OutBuffer) - Size));
			Num = write(TTY, &OutBuffer[Offset], Size);
			if (Num < 0) {
				perror("write");
				break;
			}
			Sent += Num;
		}
	}

	return EXIT_FAILURE;
}

... and run it like this:

1. machine:	ttytest	-h -s 115200 /dev/tty00
2. machine:	ttytest -r -h -s 115200 /dev/tty00

The problem happens only if the 1. machine is using com(4) as serial driver.
>Fix:
None provided.

>Release-Note:
>Audit-Trail:
>Unformatted: