Subject: Re: NetBSD/systrace error: EINVAL
To: Seth Kurtzberg <seth@cql.com>
From: Kristaps Johnson <kristaps@gradient-enterprises.com>
List: tech-misc
Date: 07/04/2006 17:15:04
>> Seth, the sample code is a mock-up of what was causing me problems from 
>> my original codebase - namely, EINVAL at read.  I duplicated only the 
>> problem-causing code and the steps required to get there (originally in 
>> effort to clarify the problem), then removed all ancillary 
>> error-checking for newspost-ready brevity.
>>
>> I can re-post the sample code with full error-checking but it does not
>> change the results (Invalid argument, meh).
>
> It's better to post the real code.  There could be something significant 
> there that you have missed.  It's unlikely, but it isn't impossible, so 
> I would post the actual code.

I've included it below.  It's one of several files so it won't compile on 
its own.  But the "problem area", i.e., death on read after poll, is in 
here.  As you'll see reading through the code below, the sample I first 
included is a simple "main" function with all the sysjail-specific stuff 
stripped away.  Incidentally, "sysjail" is the main entry function.

I use the same logical chain as systrace.1 to prepare the device and read 
from it, which is one of the primary things confusing me.  I keep hoping that 
I'm missing a magic fcntl blessing, or maybe that my read buffer's not 
wide enough.



#include <sys/ioctl.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/wait.h>

#if defined(__OpenBSD__)
# include <dev/systrace.h>
#elif defined(__NetBSD__)
# include <sys/systrace.h>
#endif

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <paths.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "intercept.h"
#include "pidctl.h"
#include "report.h"
#include "sysjail.h"
#include "jaildb.h"


/**
  * Sentinel for SIGUSR1 synchronisation between child & parent.
  * It is written from the signal handler (sigfunction) and read from
  * normal control flow, so it is declared volatile sig_atomic_t: the
  * only object type that may portably be accessed from a handler.
  */
static volatile sig_atomic_t	got_sigusr1 = 0;

/**
  * Global jail structure.  TODO.  Put in function calls.
  * Assigned once on entry to sysjail().
  */
struct sysjail*		jail;

/**
  * Master process's return code, should it return out-of-state.
  * Stays at -1 until waitmaster() has stored the master's exit code.
  */
static int		masterrc = -1;


/**
  * Signal handler.  Records that a SIGUSR1 has been detected by setting
  * the got_sigusr1 sentinel; any other signal number is only reported
  * through rep_errorx().
  *
  * @param signum The number of the delivered signal.
  */
static void		sigfunction(int signum);

/**
  * Register a process with the systrace device: attach the process
  * with the STRIOCATTACH ioctl, then install the intercepts for it via
  * intercept_init().
  *
  * @param sfd The systrace descriptor.
  * @param pid The process to attach.
  *
  * @return -1 on failure, 0 on success.
  */
static int		systrace_pidreg(int sfd, pid_t pid);

/**
  * Initialise the systrace subsystem by opening /dev/systrace (and, on
  * OpenBSD, cloning the descriptor with STRIOCCLONE).
  *
  * @param sfd A pointer to the systrace file descriptor (output).
  *
  * @return -1 on failure, 0 on success.
  */
static int		systrace_init(int *sfd);

/**
  * Read a systrace message when a child has started or stopped.
  * A new_pid of -1 in the message means the sending process has gone
  * away and is removed from the PID database; otherwise new_pid is
  * added to it.
  *
  * @param msg The systrace message.
  *
  * @return -1 on failure, 0 on success, 1 to indicate that no children
  * are left in the PID database.
  */
static int		readmsg_child(struct str_message *msg);

/**
  * Read & route a systrace message.  SYSTR_MSG_RES and SYSTR_MSG_ASK
  * messages go to intercept(), SYSTR_MSG_CHILD messages go to
  * readmsg_child().
  *
  * @param sfd The systrace file descriptor.
  *
  * @return -1 on failure, 0 on success, 1 to indicate that the system may
  * safely exit.
  */
static int		readmsg(int sfd);

/**
  * Loop on the systrace descriptor waiting for messages.  The loop ends
  * when readmsg() asks for exit or fails, when poll() fails, or when
  * poll() reports an event other than readable data.
  *
  * @param sfd The systrace file descriptor.
  *
  * @return 0 on success (including poll() being interrupted by a
  * signal), -1 on failure.
  */
static int		loop(int sfd);

/**
  * Wait on the master process.  Waits without blocking, kills with
  * SIGKILL if the process hangs.
  *
  * @param rc Pointer to the master process's return code.  Receives
  * the exit status if the master exited normally, 127 otherwise.
  *
  * @return 0 on success, -1 on failure (rc will not be set).
  */
static int		waitmaster(int *rc);




/*
 * Collect the master process's exit status without blocking.
 *
 * If the master has not yet changed state it is considered hung: it is
 * sent SIGKILL and the call fails.  On success *rc receives the exit
 * status for a normal exit, or 127 for any other kind of termination.
 *
 * Returns 0 on success, -1 on failure (*rc is left untouched).
 */
int
waitmaster(int *rc)
{
	pid_t master;
	int reaped, wstatus;

	master = pidctl_getmaster();
	assert(master >= 0);

	reaped = waitpid(master, &wstatus, WNOHANG);
	if (reaped == -1) {
		rep_error("waitpid: %d", master);
		return(-1);
	}
	if (reaped == 0) {
		/* WNOHANG reported no state change: the master is stuck. */
		rep_errorx("waitpid: %d (hung)", master);
		if (kill(master, SIGKILL) == -1) {
			rep_error("kill: %d, SIGKILL", master);
		}
		return(-1);
	}

	*rc = WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : 127;
	return(0);
}

/*
 * Signal handler shared by every signal this file catches.
 *
 * SIGUSR1 raises the got_sigusr1 sentinel; any other signal is merely
 * reported.
 */
void
sigfunction(int signum)
{
	if (signum != SIGUSR1) {
		rep_errorx("intercepted signal %d", signum);
		return;
	}
	got_sigusr1 = 1;
}

/*
 * Attach a process to the systrace descriptor and set up its
 * intercepts.
 *
 * Returns 0 on success, -1 if either the STRIOCATTACH ioctl or
 * intercept_init() fails.
 */
int
systrace_pidreg(int sfd, pid_t pid)
{
	assert(pid >= 0);
	assert(sfd >= 0);

	/* Step one: hand the pid to the systrace device. */
	if (ioctl(sfd, STRIOCATTACH, &pid) == -1) {
		rep_error("ioctl: STRIOCATTACH");
		return(-1);
	}

	/* Step two: install the intercepts for the attached pid. */
	return(intercept_init(sfd, pid) == -1 ? -1 : 0);
}

int
readmsg_child(struct str_message *msg)
{
 	struct str_msg_child *chld;
 	assert(msg);
 	chld = &msg->msg_data.msg_child;
 	if (chld->new_pid == -1) {
 		rep_reap(msg->msg_pid);
 		(void)pidctl_delpid(msg->msg_pid);
 		/* Check if we've processes left to harvest. */
 		if (pidctl_isempty()) {
 			return(1);
 		}
 		if (pidctl_getmaster() == msg->msg_pid) {
 			rep_errorx("master %d exited with extant children",
 					msg->msg_pid);
 			assert(masterrc == -1);
 			if (waitmaster(&masterrc) == -1) {
 				return(-1);
 			}
 		}
 		return(0);
 	}
 	rep_spawn(chld->new_pid, msg->msg_pid);
 	return(pidctl_addpid(chld->new_pid));
}

/*
 * Read one message from the systrace descriptor and dispatch it.
 *
 * Exactly one struct str_message is requested; a short read is treated
 * as an error.  SYSTR_MSG_RES and SYSTR_MSG_ASK are routed to
 * intercept(), SYSTR_MSG_CHILD to readmsg_child().
 *
 * Returns -1 on failure, 0 on success, 1 when the handler reports that
 * the system may safely exit.
 */
int
readmsg(int sfd)
{
	struct str_message msg;
	int rc;
	ssize_t sz;

	assert(sfd >= 0);
	sz = read(sfd, &msg, sizeof(msg));
	if (sz == -1) {
		rep_error("read (%d): /dev/systrace", sfd);
		return(-1);
	} else if ((size_t)sz != sizeof(msg)) {
		/*
		 * The format uses %d, so the ssize_t/size_t values must be
		 * narrowed to int explicitly; passing them raw through
		 * varargs is a type mismatch where they are wider than int.
		 */
		rep_errorx("read: %d, wanted %d", (int)sz, (int)sizeof(msg));
		return(-1);
	}

	switch (msg.msg_type) {
		case(SYSTR_MSG_RES):
			/* FALLTHROUGH */
		case(SYSTR_MSG_ASK):
			if ((rc = intercept(sfd, &msg)) == -1) {
				return(-1);
			} else if (rc == 1) {
				return(1);
			}
			break;

		case(SYSTR_MSG_CHILD):
			if ((rc = readmsg_child(&msg)) == -1) {
				return(-1);
			} else if (rc == 1) {
				return(1);
			}
			break;

		default:
			/* We should not get here. */
			assert(0);
			return(-1);
	}
	return(0);
}

/*
 * Event loop over the systrace descriptor; nearly verbatim from
 * systrace/intercept.c.
 *
 * Blocks in poll() until the descriptor is readable, then lets
 * readmsg() consume one message.  Returns 0 when readmsg() signals that
 * it is safe to exit or when poll() is interrupted by a signal, and -1
 * when poll() or readmsg() fails or poll() reports anything other than
 * readable data.
 */
int
loop(int sfd)
{
	struct pollfd pfd;
	int status;

	assert(sfd >= 0);
	pfd.fd = sfd;
	pfd.events = POLLIN;

	for (;;) {
		if (poll(&pfd, 1, -1) == -1) {
			if (errno == EINTR) {
				/* An interrupting signal ends the loop cleanly. */
				return(0);
			}
			rep_error("poll: /dev/systrace");
			return(-1);
		}

		if ((pfd.revents & (POLLIN | POLLRDNORM)) == 0) {
			/* Woken up for something other than readable data. */
			return(-1);
		}

		status = readmsg(sfd);
		if (status == -1) {
			return(-1);
		}
		if (status == 1) {
			return(0);
		}
	}
	/* NOTREACHED */
}

/**
  * Fork the jailed child and, in the parent, attach it to systrace and
  * run the message loop.
  *
  * Sequence: SIGUSR1 is blocked and routed to sigfunction, the jail is
  * added to the jail database, then the process forks.
  *
  * Child: signals the parent with SIGUSR1, closes its copy of the
  * systrace descriptor, chroots into sjail->path, then sleeps in
  * sigsuspend() until the parent signals back.  It finally restores the
  * previous SIGUSR1 handler and signal mask and returns 0.
  *
  * Parent: installs sigfunction for SIGTERM and SIGINT, waits in
  * sigsuspend() for the child's SIGUSR1, registers the child with
  * systrace and the PID database (as master), restores the previous
  * SIGUSR1 handler and signal mask, signals the child to continue and
  * enters loop().
  *
  * @param sfd The systrace file descriptor.
  * @param pid Output: receives the return value of fork() (0 in the
  * child, the child's pid in the parent, -1 if fork failed).  It is not
  * written when a failure occurs before the fork.
  * @param sjail The jail description; sjail->path is the chroot target.
  *
  * @return -1 on failure; 0 in the child on success; the result of
  * loop() in the parent.
  */
int
sysjail_postinit(int sfd, pid_t *pid, struct sysjail *sjail)
{
 	sigset_t none, set, oset;
 	sig_t ohandler;
 	assert(sjail != NULL);
 	assert(pid != NULL);
 	/* Block signals. */
 	/* "none" stays empty: it is the mask used while in sigsuspend(). */
 	sigemptyset(&none);
 	sigemptyset(&set);
 	sigaddset(&set, SIGUSR1);
 	/* SIGUSR1 is held pending from here on, so a signal sent before
 	 * the matching sigsuspend() is not lost. */
 	if (sigprocmask(SIG_BLOCK, &set, &oset) == -1) {
 		rep_error("sigprocmask: SIG_BLOCK");
 		return(-1);
 	}
 	if ((ohandler = signal(SIGUSR1, sigfunction)) == SIG_ERR) {
 		rep_error("signal: SIGUSR1");
 		return(-1);
 	}
 	/* This needs to be done before the fork so the child has the
 	 * modified JID. */
 	if (jaildb_add(sjail) == -1) {
 		rep_error("unable to modify jaildb: %s", _PATH_JAILDB);
 		return(-1);
 	}
 	*pid = fork();
 	if (*pid == -1) {
 		/* Undo the jail database entry added above. */
 		(void)jaildb_del(sjail);
 		rep_error("fork");
 		return(-1);

 	} else if (*pid == 0) {
 		/* In child process. */
 		/* Return control immediately after processing. */
 		/* Tell the parent that the child exists and may be attached. */
 		if (kill(getppid(), SIGUSR1) == -1) {
 			rep_error("kill: %d, SIGUSR1", getppid());
 			return(-1);
 		}
 		/* The child has no use for the systrace descriptor. */
 		if (close(sfd) == -1) {
 			rep_error("close: /dev/systrace");
 			return(-1);
 		}

 		/* Carefully chroot & set privileges. */
 		assert(sjail->path != NULL);
 		if (chroot(sjail->path) != 0) {
 			rep_error("chroot: %s", sjail->path);
 			return(-1);
 		}
 		if (chdir("/") != 0) {
 			rep_error("chdir: /");
 			return(-1);
 		}

 		/* Wait for synchronisation signal. */
 		/* The empty mask unblocks SIGUSR1 for the duration of the
 		 * wait only. */
 		sigsuspend(&none);
 		if ( ! got_sigusr1) {
 			rep_errorx("signal: intercepted wrong signal");
 			return(-1);
 		}
 		/* Put the SIGUSR1 handler and signal mask back as they were
 		 * on entry. */
 		if (signal(SIGUSR1, ohandler) == SIG_ERR) {
 			rep_error("signal: SIGUSR1");
 			return(-1);
 		}
 		if (sigprocmask(SIG_SETMASK, &oset, NULL) == -1) {
 			rep_error("sigprocmask: SIG_SETMASK");
 			return(-1);
 		}
 		return(0);
 	}
 	/* Parent process from here on. */
 	/* SIGTERM and SIGINT go to sigfunction and are set to interrupt
 	 * system calls rather than restart them. */
 	if (signal(SIGTERM, sigfunction) == SIG_ERR) {
 		rep_error("signal: SIGTERM");
 		return(-1);
 	}
 	if (signal(SIGINT, sigfunction) == SIG_ERR) {
 		rep_error("signal: SIGINT");
 		return(-1);
 	}
 	if (siginterrupt(SIGINT, 1) == -1) {
 		rep_error("siginterrupt: SIGINT");
 		return(-1);
 	}
 	if (siginterrupt(SIGTERM, 1) == -1) {
 		rep_error("siginterrupt: SIGTERM");
 		return(-1);
 	}
 	/* Wait for the child's SIGUSR1.  Any other caught signal also
 	 * ends the wait; got_sigusr1 then stays clear and this fails. */
 	sigsuspend(&none);
 	if ( ! got_sigusr1) {
 		rep_errorx("signal: intercepted wrong signal");
 		return(-1);
 	}
 	/* Still in parent process. */
 	/* Register the initial pid. */
 	rep_begin(*pid);
 	if (systrace_pidreg(sfd, *pid) == -1) {
 		rep_errorx("systrace_pidreg: %d", *pid);
 		return(-1);
 	}
 	/* The first child is the master whose exit code is reported. */
 	pidctl_setmaster(*pid);
 	if (pidctl_addpid(*pid) == -1) {
 		rep_errorx("pidctl_addpid: %d", *pid);
 		return(-1);
 	}
 	/* Restore the child handler. */
 	if (signal(SIGUSR1, ohandler) == SIG_ERR) {
 		rep_error("signal: SIGUSR1");
 		return(-1);
 	}
 	if (sigprocmask(SIG_SETMASK, &oset, NULL) == -1) {
 		rep_error("sigprocmask: SIG_SETMASK");
 		return(-1);
 	}
 	/* Signal the child that we're ready. */
 	if (kill(*pid, SIGUSR1) == -1) {
 		rep_error("kill: %d, SIGUSR1", *pid);
 		return(-1);
 	}
 	return(loop(sfd));
}

/*
 * Open the systrace device and prepare the descriptor for use.
 *
 * The device is opened read/write and non-blocking.  On OpenBSD the
 * working descriptor is obtained by cloning with STRIOCCLONE and the
 * original is closed; on NetBSD the opened descriptor is used directly.
 * The resulting descriptor is marked close-on-exec and non-blocking.
 *
 * sfd receives the ready-to-use descriptor on success.
 *
 * Returns 0 on success, -1 on failure.  On failure any descriptor
 * opened here has been closed and *sfd must not be used.
 */
int
systrace_init(int *sfd)
{
	int fd;

	assert(sfd != NULL);
	if ((fd = open("/dev/systrace", O_RDWR | O_NONBLOCK, 0)) == -1) {
		rep_error("open: /dev/systrace");
		/* Without a descriptor nothing below can succeed. */
		return(-1);
	}
#if defined(__OpenBSD__)
	/* OpenBSD requires us to clone the device descriptor. */
	if (ioctl(fd, STRIOCCLONE, sfd) == -1) {
		rep_error("ioctl: STRIOCCLONE");
		if (close(fd) == -1) {
			rep_error("close: /dev/systrace");
		}
		return(-1);
	}
	if (close(fd) == -1) {
		rep_error("close: /dev/systrace");
		if (close(*sfd)) {
			rep_error("close: /dev/systrace");
		}
		return(-1);
	}
#endif
#if defined(__NetBSD__)
	/* NetBSD needn't clone the device descriptor. */
	*sfd = fd;
#endif
	/* Keep the descriptor from leaking across exec. */
	if (fcntl(*sfd, F_SETFD, FD_CLOEXEC) == -1) {
		rep_error("fcntl: F_SETFD, 1");
		if (close(*sfd)) {
			rep_error("close: /dev/systrace");
		}
		return(-1);
	}
	if (fcntl(*sfd, F_SETFL, O_NONBLOCK) == -1) {
		rep_error("fcntl: F_SETFL, O_NONBLOCK");
		if (close(*sfd)) {
			rep_error("close: /dev/systrace");
		}
		return(-1);
	}
	return(0);
}

/*---------------------------------------------------------------------
 * Main entrance to sysjail library.
 *
 * Sets up systrace, forks the jailed child and supervises it.  In the
 * child (or when no child could be created) control returns to the
 * caller with the result of sysjail_postinit().  The parent never
 * returns: it cleans up and exits with the master's exit code.
 *-------------------------------------------------------------------*/

int
sysjail(struct sysjail *sjail)
{
	pid_t pid;
	int sfd, rc;

	assert(sjail != NULL);
	jail = sjail;
	sfd = -1;
	jail->master = getpid();

	if (systrace_init(&sfd) == -1) {
		return(-1);
	}
	assert(sfd >= 0);

	pid = -1;
	rc = sysjail_postinit(sfd, &pid, sjail);
	if (pid <= 0) {
		/* Child process, or no child was ever created. */
		return(rc);
	}

	/*
	 * Parent exit sequence; the order matters.  Tidy up our own
	 * environment first, then reap the child.  The exit code is the
	 * child's exit code if it has one, 127 if the child was ended
	 * by a signal, and 255 if reaping it failed.
	 */

	if (sjail->jid > 0) {
		(void)jaildb_del(sjail);
	}
	if (close(sfd) == -1) {
		rep_error("close: /dev/systrace");
		rc = 255;
	}

	/* Only reap the master if its exit code was not collected yet. */
	if (masterrc == -1 && waitmaster(&masterrc) == -1) {
		rc = 255;
	} else {
		rc = masterrc;
	}

	rep_exit(pid, rc);
	exit(rc);
	/* NOTREACHED */
}