tech-userlevel archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
regsub() proposal
I would like to have sed -e 's/foo\(bar\)foo/\1/'. Most interpreted
languages have easy access to this, but there is no C function that
does this. I propose:
ssize_t
regsub(char *buf, size_t len, const char *sub, const regmatch_t *rm,
const char *str);
ssize_t
aregsub(char **buf, const char *sub, const regmatch_t *rm, const char *str);
regsub() is like snprintf() where aregsub() is like asprintf().
The rest of the arguments are:
sub = the right hand side of the sed substitution containing
\<n> or & escapes.
rm = the regexec() match list. It should be 10 elements long.
str = the input string to be used
They return either the number of characters for the full conversion,
or -1 for error.
Here's the implementation, together with a main program...
christos
/* $NetBSD$ */
/*-
* Copyright (c) 2015 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD$");
#include <sys/param.h>
#include <ctype.h>
#include <errno.h>
#include <regex.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/*
 * Output accumulator shared by regsub() and aregsub().
 *
 * s_len always counts the bytes the complete expansion needs (not counting
 * the terminating NUL).  It keeps growing after a fixed buffer has filled
 * up, so it may exceed s_max; never compute s_max - s_len without checking.
 */
struct str {
	char *s_ptr;		/* output buffer; may be NULL while s_max is 0 */
	size_t s_max;		/* capacity of s_ptr in bytes */
	size_t s_len;		/* bytes required so far, excluding the NUL */
	int s_fixed;		/* caller-owned buffer: truncate, never grow */
	int s_err;		/* a growth attempt failed; contents are void */
};

/* Extra slack requested every time the dynamic buffer has to grow. */
enum { REGSUB_CHUNK = 64 };

/*
 * Make sure `len' more bytes plus a terminating NUL fit at offset s_len.
 * Returns 0 when they do and -1 when they do not (fixed buffer full, or
 * allocation failure).  An allocation failure is sticky: s_err is set and
 * every later call fails, so the output can never contain an unwritten gap.
 */
static int
addspace(struct str *s, size_t len)
{
	size_t newmax;
	void *v;

	if (s->s_err)
		return -1;

	/* Guard the subtraction: s_len may already be past s_max. */
	if (s->s_len < s->s_max && s->s_max - s->s_len > len)
		return 0;

	if (s->s_fixed)
		return -1;

	if (s->s_len > SIZE_MAX - REGSUB_CHUNK - 1 ||
	    len > SIZE_MAX - REGSUB_CHUNK - 1 - s->s_len) {
		errno = ENOMEM;
		goto fail;
	}

	/* Commit the new capacity only once realloc() has succeeded. */
	newmax = s->s_len + len + 1 + REGSUB_CHUNK;
	v = realloc(s->s_ptr, newmax);
	if (v == NULL)
		goto fail;

	s->s_ptr = v;
	s->s_max = newmax;
	return 0;
fail:
	s->s_err = 1;
	return -1;
}

/*
 * Append `len' bytes from `buf'.  When a fixed buffer is too small the
 * leading part that still fits (leaving room for the NUL) is copied, as
 * snprintf() does; s_len advances by the full `len' regardless.
 */
static void
addnstr(struct str *s, const char *buf, size_t len)
{
	if (addspace(s, len) == 0)
		memcpy(s->s_ptr + s->s_len, buf, len);
	else if (s->s_fixed && s->s_max != 0 && s->s_len < s->s_max - 1)
		memcpy(s->s_ptr + s->s_len, buf, s->s_max - 1 - s->s_len);
	s->s_len += len;
}

/* Append the single character `c'. */
static void
addchar(struct str *s, int c)
{
	char ch = (char)c;

	addnstr(s, &ch, 1);
}

/*
 * NUL-terminate the output without counting the terminator in s_len.
 * A truncated fixed buffer is terminated in its last byte; a zero-sized
 * buffer is left untouched.
 */
static void
addnul(struct str *s)
{
	if (addspace(s, 0) == 0)
		s->s_ptr[s->s_len] = '\0';
	else if (s->s_fixed && s->s_max != 0)
		s->s_ptr[s->s_max - 1] = '\0';
}

/*
 * Prepare an accumulator.  A NULL `buf' is treated as having no capacity:
 * a fixed accumulator then only counts, a dynamic one allocates on first
 * use.
 */
static void
initstr(struct str *s, char *buf, size_t len, int fixed)
{
	s->s_ptr = buf;
	s->s_max = buf == NULL ? 0 : len;
	s->s_len = 0;
	s->s_fixed = fixed;
	s->s_err = 0;
}

/*
 * Expand the sed-style replacement `sub' into `s'.
 *
 *	&	text of the whole match (rm[0])
 *	\0-\9	text of the corresponding rm[] entry
 *	\& \\	a literal '&' or '\'
 *
 * A backslash followed by anything else is copied through unchanged.
 * Entries of rm[] that did not participate in the match (offset -1)
 * expand to nothing.  `str' is the string regexec() was run on.
 */
static void
regsub1(struct str *s, const char *sub, const regmatch_t *rm, const char *str)
{
	char c;

	while ((c = *sub++) != '\0') {
		int idx = -1;

		if (c == '&') {
			idx = 0;
		} else if (c == '\\') {
			if (isdigit((unsigned char)*sub))
				idx = *sub++ - '0';
			else if (*sub == '\\' || *sub == '&')
				c = *sub++;
		}

		if (idx == -1) {
			addchar(s, c);
			continue;
		}

		if (rm[idx].rm_so != -1 && rm[idx].rm_eo != -1 &&
		    rm[idx].rm_eo >= rm[idx].rm_so)
			addnstr(s, str + rm[idx].rm_so,
			    (size_t)(rm[idx].rm_eo - rm[idx].rm_so));
	}
	addnul(s);
}

/*
 * snprintf()-like substitution into the caller's buffer.
 *
 * Writes at most `len' bytes (including the terminating NUL) to `buf' and
 * returns the length the complete expansion needs, so a return value
 * >= len means the output was truncated.  `buf' may be NULL (or `len' 0)
 * to only compute the required length.  `rm' must have 10 elements.
 */
ssize_t
regsub(char *buf, size_t len, const char *sub, const regmatch_t *rm,
    const char *str)
{
	struct str s;

	initstr(&s, buf, len, 1);
	regsub1(&s, sub, rm, str);
	return (ssize_t)s.s_len;
}

/*
 * asprintf()-like substitution.
 *
 * On success stores a malloc()ed, NUL-terminated result in *buf (to be
 * released with free()) and returns its length.  On allocation failure
 * returns -1 with errno set to ENOMEM and *buf set to NULL.
 * `rm' must have 10 elements.
 */
ssize_t
aregsub(char **buf, const char *sub, const regmatch_t *rm, const char *str)
{
	struct str s;

	*buf = NULL;
	initstr(&s, NULL, 0, 0);
	regsub1(&s, sub, rm, str);
	if (s.s_err) {
		free(s.s_ptr);
		errno = ENOMEM;
		return -1;
	}
	*buf = s.s_ptr;
	return (ssize_t)s.s_len;
}
#include <stdio.h>
#include <regex.h>
#include <stdlib.h>
#include <err.h>
/*
 * Test driver: compile <pattern> as an extended regular expression, match
 * it against <input>, and print <substitute> expanded with the resulting
 * sub-matches.  Exits with EXIT_FAILURE on usage, compile, match or
 * substitution errors; a plain non-match is reported but is not an error.
 */
int
main(int argc, char *argv[])
{
	regex_t re;
	int e;
	char buf[1024], *ptr = NULL;
	regmatch_t rm[10];

	if (argc != 4) {
		fprintf(stderr, "Usage: %s <pattern> <substitute> <input>\n",
		    getprogname());
		return EXIT_FAILURE;
	}

	/* errx() appends its own newline, so the formats carry none. */
	if ((e = regcomp(&re, argv[1], REG_EXTENDED)) != 0) {
		regerror(e, &re, buf, sizeof(buf));
		errx(EXIT_FAILURE, "regcomp(%s): %s", argv[1], buf);
	}

	/*
	 * Match against the input (argv[3]); the offsets in rm[] are later
	 * applied to that same string by aregsub().
	 */
	switch (e = regexec(&re, argv[3], __arraycount(rm), rm, 0)) {
	case 0:
		if (aregsub(&ptr, argv[2], rm, argv[3]) < 0)
			err(EXIT_FAILURE, "substitution failed");
		printf("%s\n", ptr);
		free(ptr);
		break;
	case REG_NOMATCH:
		printf("no match: %s\n", argv[3]);
		break;
	default:
		regerror(e, &re, buf, sizeof(buf));
		errx(EXIT_FAILURE, "regexec(%s): %s", argv[3], buf);
	}

	regfree(&re);
	return EXIT_SUCCESS;
}
Index: Makefile.inc
===================================================================
RCS file: /cvsroot/src/lib/libc/regex/Makefile.inc,v
retrieving revision 1.7
diff -u -u -r1.7 Makefile.inc
--- Makefile.inc 14 Nov 1997 02:04:46 -0000 1.7
+++ Makefile.inc 9 Jan 2016 03:05:56 -0000
@@ -6,9 +6,9 @@
CPPFLAGS+=-DPOSIX_MISTAKE
-SRCS+= regcomp.c regerror.c regexec.c regfree.c
+SRCS+= regcomp.c regerror.c regexec.c regfree.c regsub.c
MAN+= regex.3 re_format.7
MLINKS+=regex.3 regcomp.3 regex.3 regexec.3 regex.3 regerror.3 \
- regex.3 regfree.3
+ regex.3 regfree.3 regex.3 regsub.3 regex.3 aregsub.3
Index: regex.3
===================================================================
RCS file: /cvsroot/src/lib/libc/regex/regex.3,v
retrieving revision 1.22
diff -u -u -r1.22 regex.3
--- regex.3 17 May 2011 03:35:38 -0000 1.22
+++ regex.3 9 Jan 2016 03:05:57 -0000
@@ -65,7 +65,7 @@
.\"
.\" @(#)regex.3 8.4 (Berkeley) 3/20/94
.\"
-.Dd December 29, 2003
+.Dd January 8, 2016
.Dt REGEX 3
.Os
.Sh NAME
@@ -73,7 +73,9 @@
.Nm regcomp ,
.Nm regexec ,
.Nm regerror ,
-.Nm regfree
+.Nm regfree ,
+.Nm regsub ,
+.Nm aregsub
.Nd regular-expression library
.Sh LIBRARY
.Lb libc
@@ -87,6 +89,10 @@
.Fn regerror "int errcode" "const regex_t * restrict preg" "char * restrict errbuf" "size_t errbuf_size"
.Ft void
.Fn regfree "regex_t *preg"
+.Ft ssize_t
+.Fn regsub "char *buf" "size_t bufsiz" "const char *sub" "const regmatch_t *rm" "const char *str"
+.Ft ssize_t
+.Fn aregsub "char **buf" "const char *sub" "const regmatch_t *rm" "const char *str"
.Sh DESCRIPTION
These routines implement
.St -p1003.2-92
@@ -466,6 +472,40 @@
None of these functions references global variables except for tables
of constants;
all are safe for use from multiple threads if the arguments are safe.
+.Pp
+The
+.Fn regsub
+and
+.Fn aregsub
+functions perform substitutions using
+.Xr sed 1
+like syntax.
+They return the length of the string that would have been created
+if there was enough space or
+.Dv \-1
+on error, setting
+.Dv errno .
+The result
+is placed in
+.Fa buf
+which is user-supplied in
+.Fn regsub
+and dynamically allocated in
+.Fn aregsub .
+The
+.Fa sub
+argument contains a substitution string which might refer to the first
+9 regular expression strings using \e<n> to refer to the nth matched
+item, or & to refer to the full match.
+The
+.Fa rm
+array must be at least 10 elements long, and should contain the result
+of the matches from a previous
+.Fn regexec
+call.
+The
+.Fa str
+argument contains the source string to apply the transformation to.
.Sh IMPLEMENTATION CHOICES
There are a number of decisions that
.St -p1003.2-92
@@ -576,6 +616,13 @@
Altered for inclusion in the
.Bx 4.4
distribution.
+.Pp
+The
+.Fn regsub
+and
+.Fn aregsub
+functions appeared in
+.Nx 8 .
.Sh BUGS
There is one known functionality bug.
The implementation of internationalization is incomplete:
Index: regex.h
===================================================================
RCS file: /cvsroot/src/include/regex.h,v
retrieving revision 1.13
diff -u -u -r1.13 regex.h
--- regex.h 13 Sep 2005 01:44:32 -0000 1.13
+++ regex.h 9 Jan 2016 03:06:21 -0000
@@ -137,6 +137,10 @@
int regexec(const regex_t * __restrict,
const char * __restrict, size_t, regmatch_t [], int);
void regfree(regex_t *);
+#ifdef _NETBSD_SOURCE
+ssize_t regsub(char *, size_t, const char *, const regmatch_t *, const char *);
+ssize_t aregsub(char **buf, const char *, const regmatch_t *, const char *);
+#endif
__END_DECLS
#endif /* !_REGEX_H_ */
Home |
Main Index |
Thread Index |
Old Index