tech-userlevel archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

regsub() proposal



I would like to have the equivalent of sed -e 's/foo\(bar\)foo/\1/'
available from C. Most interpreted languages provide easy access to
this, but there is no C function that does it. I propose:

ssize_t
regsub(char *buf, size_t len, const char *sub, const regmatch_t *rm,
    const char *str);

ssize_t
aregsub(char **buf, const char *sub, const regmatch_t *rm, const char *str);

regsub() is like snprintf(), whereas aregsub() is like asprintf().

The rest of the arguments are:

	sub = the right hand side of the sed substitution containing
		\<n> or & escapes.
	rm = the regexec() match list. It should be 10 elements long.
	str = the input string to be used

They return either the number of characters for the full conversion,
or -1 for error.

Here's the implementation, together with a main program...

christos

/*	$NetBSD$	*/

/*-
 * Copyright (c) 2015 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Christos Zoulas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__RCSID("$NetBSD$");

#include <sys/param.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>

/*
 * Output accumulator used to build the substituted string.
 *
 * s_len counts every byte the complete result needs so far, so it may run
 * past s_max once a fixed buffer is full or growing the buffer has failed.
 * Bytes are only ever stored below s_max.
 */
struct str {
	char *s_ptr;		/* start of the output buffer */
	size_t s_max;		/* capacity of s_ptr in bytes */
	size_t s_len;		/* length of the full result so far */
	int s_fixed;		/* caller-owned buffer, never reallocated */
	int s_err;		/* growing the buffer failed; stop storing */
};

/*
 * Make sure strictly more than len bytes are free at the write position,
 * i.e. room for len bytes of data plus a terminating NUL.
 *
 * Returns 0 when the space is available, -1 when it is not (fixed buffer
 * exhausted, or allocation failure).  An allocation failure is sticky so
 * that a later, successful realloc() cannot leave an unwritten hole in
 * the middle of the result.
 */
static int
addspace(struct str *s, size_t len)
{
	size_t grow, nmax;
	void *v;

	if (s->s_err)
		return -1;

	/*
	 * s_len may already exceed s_max; compare first so the unsigned
	 * subtraction cannot wrap around and report bogus free space.
	 */
	if (s->s_len < s->s_max && s->s_max - s->s_len > len)
		return 0;

	if (s->s_fixed)
		return -1;

	grow = MAX(len, 64);
	if (grow > (size_t)-1 - s->s_max) {
		s->s_err = 1;
		return -1;
	}
	nmax = s->s_max + grow;

	v = realloc(s->s_ptr, nmax);
	if (v == NULL) {
		/* the old buffer and its recorded capacity stay valid */
		s->s_err = 1;
		return -1;
	}
	s->s_ptr = v;
	s->s_max = nmax;	/* commit the new size only on success */

	return 0;
}

/*
 * Append one character.  A NUL terminates the string without being
 * counted in s_len; when the output was truncated the terminator goes
 * into the last byte of the buffer instead, provided the buffer is not
 * empty.
 */
static void
addchar(struct str *s, int c)
{
	if (c == '\0') {
		if (addspace(s, 0) == 0)
			s->s_ptr[s->s_len] = '\0';
		else if (s->s_max > 0)
			s->s_ptr[s->s_max - 1] = '\0';
		return;
	}

	if (addspace(s, 1) == 0)
		s->s_ptr[s->s_len] = (char)c;
	s->s_len++;
}

/*
 * Append len bytes from buf.  If they do not all fit, keep the prefix
 * that does (leaving the last byte free for the terminator) so that a
 * truncated result never contains uninitialised bytes.  s_len always
 * advances by the full len.
 */
static void
addnstr(struct str *s, const char *buf, size_t len)
{
	if (addspace(s, len) == 0)
		memcpy(s->s_ptr + s->s_len, buf, len);
	else if (s->s_len < s->s_max && s->s_max - s->s_len > 1)
		memcpy(s->s_ptr + s->s_len, buf, s->s_max - s->s_len - 1);
	s->s_len += len;
}

/*
 * Prepare an accumulator.  A non-NULL buf is used as a fixed buffer of
 * len bytes; with a NULL buf an initial buffer of len bytes is allocated
 * and may grow later.  Returns 0 on success, -1 if no buffer is available.
 */
static int
initstr(struct str *s, char *buf, size_t len)
{
	s->s_max = len;
	s->s_ptr = buf == NULL ? malloc(len) : buf;
	s->s_fixed = buf != NULL;
	s->s_len = 0;
	s->s_err = 0;
	return s->s_ptr == NULL ? -1 : 0;
}

static ssize_t
regsub1(char **buf, size_t len, const char *sub,
    const regmatch_t *rm, const char *str)
{
        ssize_t i;
        char c; 
	struct str s;

	if (initstr(&s, *buf, len) == -1)
		return -1;

        while ((c = *sub++) != '\0') {

		switch (c) {
		case '&':
			i = 0;
			break;
		case '\\':
			if (isdigit((unsigned char)*sub))
				i = *sub++ - '0';
			else
				i = -1;
			break;
		default:
			i = -1;
			break;
		}

                if (i == -1) {
                        if (c == '\\' && (*sub == '\\' || *sub == '&'))
                                c = *sub++;
			addchar(&s, c);
                } else if (rm[i].rm_so != -1 && rm[i].rm_eo != -1) {
                        size_t l = (size_t)(rm[i].rm_eo - rm[i].rm_so);
			addnstr(&s, str + rm[i].rm_so, l);
                }
        }

	addchar(&s, '\0');
	if (!s.s_fixed) {
		if (s.s_len >= s.s_max) {
			free(s.s_ptr);
			return -1;
		}
		*buf = s.s_ptr;
	}
	return s.s_len;
}

/*
 * Expand the substitution template sub, using the match offsets in rm
 * applied to str, into the caller-supplied buffer buf of len bytes.
 *
 * Returns whatever regsub1() reports: the length of the result, or -1.
 *
 * If buf is NULL, regsub1() falls back to allocating its own buffer and
 * hands it back through the pointer it is given.  That buffer is of no
 * use to the caller here, so it is released instead of being leaked and
 * only the length is returned.
 */
ssize_t
regsub(char *buf, size_t len, const char *sub, const regmatch_t *rm,
    const char *str)
{
	char *out = buf;
	ssize_t n;

	n = regsub1(&out, len, sub, rm, str);

	/* out is still NULL if regsub1() failed; free(NULL) is a no-op */
	if (buf == NULL)
		free(out);

	return n;
}

/*
 * Allocating variant of regsub(): expands sub using rm and str into a
 * buffer obtained by regsub1(), starting from an initial size of 64
 * bytes.  *buf is cleared first, so it is NULL unless regsub1() stores
 * a result buffer in it.  Returns the value reported by regsub1().
 */
ssize_t
aregsub(char **buf, const char *sub, const regmatch_t *rm, const char *str)
{
	ssize_t n;

	*buf = NULL;
	n = regsub1(buf, 64, sub, rm, str);
	return n;
}

#include <stdio.h>
#include <regex.h>
#include <stdlib.h>
#include <err.h>


/*
 * Test driver: compile <pattern> (argv[1]) as an extended regular
 * expression, match it against <input> (argv[3]) and print the result of
 * expanding <substitute> (argv[2]) with the matched subexpressions.
 */
int
main(int argc, char *argv[])
{
	regex_t re;
	int e;
	char buf[1024], *ptr;
	regmatch_t rm[10];

	if (argc != 4) {
		fprintf(stderr, "Usage: %s <pattern> <substitute> <input>\n",
		    getprogname());
		return EXIT_FAILURE;
	}

	if ((e = regcomp(&re, argv[1], REG_EXTENDED)) != 0) {
		regerror(e, &re, buf, sizeof(buf));
		/* errx() appends the newline itself */
		errx(EXIT_FAILURE, "regcomp(%s): %s", argv[1], buf);
	}

	/*
	 * Match against the input string: the offsets stored in rm are
	 * applied to argv[3] by aregsub() below, so they must come from it.
	 */
	switch (e = regexec(&re, argv[3], __arraycount(rm), rm, 0)) {
	case 0:
		if (aregsub(&ptr, argv[2], rm, argv[3]) < 0)
			err(EXIT_FAILURE, "substitution failed");
		printf("%s\n", ptr);
		free(ptr);
		break;
	case REG_NOMATCH:
		printf("no match: %s\n", argv[3]);
		break;
	default:
		regerror(e, &re, buf, sizeof(buf));
		errx(EXIT_FAILURE, "regexec(%s): %s", argv[3], buf);
	}

	regfree(&re);
	return EXIT_SUCCESS;
}
Index: Makefile.inc
===================================================================
RCS file: /cvsroot/src/lib/libc/regex/Makefile.inc,v
retrieving revision 1.7
diff -u -u -r1.7 Makefile.inc
--- Makefile.inc	14 Nov 1997 02:04:46 -0000	1.7
+++ Makefile.inc	9 Jan 2016 03:05:56 -0000
@@ -6,9 +6,9 @@
 
 CPPFLAGS+=-DPOSIX_MISTAKE
 
-SRCS+=	regcomp.c regerror.c regexec.c regfree.c
+SRCS+=	regcomp.c regerror.c regexec.c regfree.c regsub.c
 
 MAN+=	regex.3 re_format.7
 
 MLINKS+=regex.3 regcomp.3 regex.3 regexec.3 regex.3 regerror.3 \
-	regex.3 regfree.3
+	regex.3 regfree.3 regex.3 regsub.3 regex.3 aregsub.3
Index: regex.3
===================================================================
RCS file: /cvsroot/src/lib/libc/regex/regex.3,v
retrieving revision 1.22
diff -u -u -r1.22 regex.3
--- regex.3	17 May 2011 03:35:38 -0000	1.22
+++ regex.3	9 Jan 2016 03:05:57 -0000
@@ -65,7 +65,7 @@
 .\"
 .\"	@(#)regex.3	8.4 (Berkeley) 3/20/94
 .\"
-.Dd December 29, 2003
+.Dd January 8, 2016
 .Dt REGEX 3
 .Os
 .Sh NAME
@@ -73,7 +73,9 @@
 .Nm regcomp ,
 .Nm regexec ,
 .Nm regerror ,
-.Nm regfree
+.Nm regfree ,
+.Nm regsub ,
+.Nm aregsub
 .Nd regular-expression library
 .Sh LIBRARY
 .Lb libc
@@ -87,6 +89,10 @@
 .Fn regerror "int errcode" "const regex_t * restrict preg" "char * restrict errbuf" "size_t errbuf_size"
 .Ft void
 .Fn regfree "regex_t *preg"
+.Ft ssize_t
+.Fn regsub "char *buf" "size_t bufsiz" "const char *sub" "const regmatch_t *rm" "const char *str"
+.Ft ssize_t
+.Fn aregsub "char **buf" "const char *sub" "const regmatch_t *rm" "const char *str"
 .Sh DESCRIPTION
 These routines implement
 .St -p1003.2-92
@@ -466,6 +472,40 @@
 None of these functions references global variables except for tables
 of constants;
 all are safe for use from multiple threads if the arguments are safe.
+.Pp
+The
+.Fn regsub
+and
+.Fn aregsub
+functions perform substitutions using
+.Xr sed 1
+like syntax.
+They return the length of the string that would have been created
+if there was enough space or
+.Dv \-1
+on error, setting
+.Dv errno .
+The result
+is placed in
+.Fa buf
+which is user-supplied in
+.Fn regsub
+and dynamically allocated in
+.Fn aregsub .
+The
+.Fa sub
+argument contains a substitution string which might refer to the first
+9 regular expression strings using \e<n> to refer to the nth matched
+item, or & to refer to the full match; \e& yields a literal &.
+The
+.Fa rm
+array must be at least 10 elements long, and should contain the result
+of the matches from a previous
+.Fn regexec
+call.
+The
+.Fa str
+argument contains the source string to apply the transformation to.
 .Sh IMPLEMENTATION CHOICES
 There are a number of decisions that
 .St -p1003.2-92
@@ -576,6 +616,13 @@
 Altered for inclusion in the
 .Bx 4.4
 distribution.
+.Pp
+The
+.Fn regsub
+and
+.Fn aregsub
+functions appeared in
+.Nx 8 .
 .Sh BUGS
 There is one known functionality bug.
 The implementation of internationalization is incomplete:
Index: regex.h
===================================================================
RCS file: /cvsroot/src/include/regex.h,v
retrieving revision 1.13
diff -u -u -r1.13 regex.h
--- regex.h	13 Sep 2005 01:44:32 -0000	1.13
+++ regex.h	9 Jan 2016 03:06:21 -0000
@@ -137,6 +137,10 @@
 int	regexec(const regex_t * __restrict,
 	    const char * __restrict, size_t, regmatch_t [], int);
 void	regfree(regex_t *);
+#ifdef _NETBSD_SOURCE
+ssize_t regsub(char *, size_t, const char *, const regmatch_t *, const char *);
+ssize_t aregsub(char **buf, const char *, const regmatch_t *, const char *);
+#endif
 __END_DECLS
 
 #endif /* !_REGEX_H_ */


Home | Main Index | Thread Index | Old Index