grep: option to use GNU regex matching instead of POSIX one.

This fixes problems with NUL bytes in the files being scanned, but
costs +800 bytes. The same can be done for sed (TODO).
This commit is contained in:
Denis Vlasenko 2008-08-09 16:15:14 +00:00
parent fb5902ca5c
commit 3fd15e197e
5 changed files with 155 additions and 36 deletions

View File

@ -21,6 +21,15 @@ config DESKTOP
Select this only if you plan to use busybox on full-blown
desktop machine with common Linux distro, not on an embedded box.
config EXTRA_COMPAT
bool "Provide compatible behavior for rare corner cases (bigger code)"
default n
help
This option makes grep, sed etc handle rare corner cases
(embedded NUL bytes and such). This makes code bigger and uses
some GNU extensions in libc. You probably only need this option
if you plan to run busybox on desktop.
config FEATURE_ASSUME_UNICODE
bool "Assume that 1:1 char/glyph correspondence is not true"
default n

View File

@ -96,6 +96,7 @@ struct globals {
int lines_before;
int lines_after;
char **before_buf;
USE_EXTRA_COMPAT(size_t *before_buf_size;)
int last_line_printed;
#endif
/* globals used internally */
@ -117,6 +118,7 @@ struct globals {
#define lines_before (G.lines_before )
#define lines_after (G.lines_after )
#define before_buf (G.before_buf )
#define before_buf_size (G.before_buf_size )
#define last_line_printed (G.last_line_printed )
#define pattern_head (G.pattern_head )
#define cur_file (G.cur_file )
@ -124,14 +126,24 @@ struct globals {
/* Per-pattern state: the pattern text, its compiled form, the range of
 * the most recent match, and flags saying what needs to be released. */
typedef struct grep_list_data_t {
/* pattern source text; it is what gets handed to the regex compiler */
char *pattern;
/* NOTE(review): 'preg' shows up next to 'compiled_regex' in this view;
 * confirm which of the two members is the current one */
regex_t preg;
/* for GNU regex, matched_range must be persistent across grep_file() calls */
#if !ENABLE_EXTRA_COMPAT
/* POSIX flavour: regcomp()-style buffer and a single rm_so/rm_eo pair */
regex_t compiled_regex;
regmatch_t matched_range;
#else
/* GNU flavour: re_compile_pattern()/re_search() buffer and registers */
struct re_pattern_buffer compiled_regex;
struct re_registers matched_range;
#endif
/* bit values stored in flg_mem_alocated_compiled */
#define ALLOCATED 1
#define COMPILED 2
/* ALLOCATED: 'pattern' is freed at cleanup;
 * COMPILED: the regex has been compiled and is regfree()d at cleanup */
int flg_mem_alocated_compiled;
} grep_list_data_t;
static void print_line(const char *line, int linenum, char decoration)
#if !ENABLE_EXTRA_COMPAT
#define print_line(line, line_len, linenum, decoration) \
print_line(line, linenum, decoration)
#endif
static void print_line(const char *line, size_t line_len, int linenum, char decoration)
{
#if ENABLE_FEATURE_GREP_CONTEXT
/* Happens when we go to next file, immediately hit match
@ -139,8 +151,9 @@ static void print_line(const char *line, int linenum, char decoration)
if (linenum < 1)
return;
/* possibly print the little '--' separator */
if ((lines_before || lines_after) && did_print_line &&
last_line_printed != linenum - 1) {
if ((lines_before || lines_after) && did_print_line
&& last_line_printed != linenum - 1
) {
puts("--");
}
/* guard against printing "--" before first line of first file */
@ -152,17 +165,50 @@ static void print_line(const char *line, int linenum, char decoration)
if (PRINT_LINE_NUM)
printf("%i%c", linenum, decoration);
/* Emulate weird GNU grep behavior with -ov */
if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o))
if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) {
#if !ENABLE_EXTRA_COMPAT
puts(line);
#else
fwrite(line, 1, line_len, stdout);
putchar('\n');
#endif
}
}
#if ENABLE_EXTRA_COMPAT
/* Read one line with getline(). Unlike getline, this one removes trailing '\n'.
 *
 * line_ptr, line_alloc_len: buffer pointer and its allocated size, exactly
 *   as for getline(); the buffer is reused and grown across calls.
 * file: stream to read from.
 *
 * Returns the length of the line without the trailing '\n' (the line may
 * contain embedded NUL bytes, so callers must use this length, not strlen),
 * or a negative value on EOF/error. On failure the buffer is freed and
 * *line_ptr is reset to NULL, so the caller never holds a dangling pointer
 * and may safely free() it or call this function again.
 */
static ssize_t FAST_FUNC bb_getline(char **line_ptr, size_t *line_alloc_len, FILE *file)
{
	ssize_t res_sz;
	char *line;

	res_sz = getline(line_ptr, line_alloc_len, file);
	line = *line_ptr;
	if (res_sz < 0) {
		/* uclibc allocates a buffer even on EOF, so release it here */
		free(line);
		/* do not leave the freed pointer (and its stale size) with the caller */
		*line_ptr = NULL;
		*line_alloc_len = 0;
		return res_sz;
	}
	/* A zero-length result keeps its buffer: the caller treats any
	 * non-negative return as a valid line and will read from it */
	if (res_sz > 0 && line[res_sz - 1] == '\n')
		line[--res_sz] = '\0';
	return res_sz;
}
#endif
static int grep_file(FILE *file)
{
char *line;
smalluint found;
int linenum = 0;
int nmatches = 0;
regmatch_t regmatch;
#if !ENABLE_EXTRA_COMPAT
char *line;
#else
char *line = NULL;
ssize_t line_len;
size_t line_alloc_len;
#define rm_so start[0]
#define rm_eo end[0]
#endif
#if ENABLE_FEATURE_GREP_CONTEXT
int print_n_lines_after = 0;
int curpos = 0; /* track where we are in the circular 'before' buffer */
@ -171,7 +217,13 @@ static int grep_file(FILE *file)
enum { print_n_lines_after = 0 };
#endif /* ENABLE_FEATURE_GREP_CONTEXT */
while ((line = xmalloc_fgetline(file)) != NULL) {
while (
#if !ENABLE_EXTRA_COMPAT
(line = xmalloc_fgetline(file)) != NULL
#else
(line_len = bb_getline(&line, &line_alloc_len, file)) >= 0
#endif
) {
llist_t *pattern_ptr = pattern_head;
grep_list_data_t *gl = gl; /* for gcc */
@ -184,19 +236,35 @@ static int grep_file(FILE *file)
} else {
if (!(gl->flg_mem_alocated_compiled & COMPILED)) {
gl->flg_mem_alocated_compiled |= COMPILED;
xregcomp(&(gl->preg), gl->pattern, reflags);
#if !ENABLE_EXTRA_COMPAT
xregcomp(&gl->compiled_regex, gl->pattern, reflags);
#else
memset(&gl->compiled_regex, 0, sizeof(gl->compiled_regex));
if (re_compile_pattern(gl->pattern, strlen(gl->pattern), &gl->compiled_regex))
bb_error_msg_and_die("bad regex '%s'", gl->pattern);
#endif
}
regmatch.rm_so = 0;
regmatch.rm_eo = 0;
if (regexec(&(gl->preg), line, 1, &regmatch, 0) == 0) {
#if !ENABLE_EXTRA_COMPAT
gl->matched_range.rm_so = 0;
gl->matched_range.rm_eo = 0;
#endif
if (
#if !ENABLE_EXTRA_COMPAT
regexec(&gl->compiled_regex, line, 1, &gl->matched_range, 0) == 0
#else
re_search(&gl->compiled_regex, line, line_len,
/*start:*/ 0, /*range:*/ line_len,
&gl->matched_range) >= 0
#endif
) {
if (!(option_mask32 & OPT_w))
found = 1;
else {
char c = ' ';
if (regmatch.rm_so)
c = line[regmatch.rm_so - 1];
if (gl->matched_range.rm_so)
c = line[gl->matched_range.rm_so - 1];
if (!isalnum(c) && c != '_') {
c = line[regmatch.rm_eo];
c = line[gl->matched_range.rm_eo];
if (!c || (!isalnum(c) && c != '_'))
found = 1;
}
@ -261,7 +329,7 @@ static int grep_file(FILE *file)
/* now print each line in the buffer, clearing them as we go */
while (before_buf[idx] != NULL) {
print_line(before_buf[idx], first_buf_entry_line_num, '-');
print_line(before_buf[idx], before_buf_size[idx], first_buf_entry_line_num, '-');
free(before_buf[idx]);
before_buf[idx] = NULL;
idx = (idx + 1) % lines_before;
@ -277,13 +345,15 @@ static int grep_file(FILE *file)
/* -Fo just prints the pattern
* (unless -v: -Fov doesnt print anything at all) */
if (found)
print_line(gl->pattern, linenum, ':');
print_line(gl->pattern, strlen(gl->pattern), linenum, ':');
} else {
line[regmatch.rm_eo] = '\0';
print_line(line + regmatch.rm_so, linenum, ':');
line[gl->matched_range.rm_eo] = '\0';
print_line(line + gl->matched_range.rm_so,
gl->matched_range.rm_eo - gl->matched_range.rm_so,
linenum, ':');
}
} else {
print_line(line, linenum, ':');
print_line(line, line_len, linenum, ':');
}
}
}
@ -291,12 +361,13 @@ static int grep_file(FILE *file)
else { /* no match */
/* if we need to print some context lines after the last match, do so */
if (print_n_lines_after) {
print_line(line, linenum, '-');
print_line(line, strlen(line), linenum, '-');
print_n_lines_after--;
} else if (lines_before) {
/* Add the line to the circular 'before' buffer */
free(before_buf[curpos]);
before_buf[curpos] = line;
USE_EXTRA_COMPAT(before_buf_size[curpos] = line_len;)
curpos = (curpos + 1) % lines_before;
/* avoid free(line) - we took the line */
line = NULL;
@ -304,13 +375,14 @@ static int grep_file(FILE *file)
}
#endif /* ENABLE_FEATURE_GREP_CONTEXT */
#if !ENABLE_EXTRA_COMPAT
free(line);
#endif
/* Did we print all context after last requested match? */
if ((option_mask32 & OPT_m)
&& !print_n_lines_after && nmatches == max_matches)
break;
}
} /* while (read line) */
/* special-case file post-processing for options where we don't print line
* matches, just filenames and possibly match counts */
@ -428,15 +500,16 @@ int grep_main(int argc, char **argv)
lines_after = Copt;
if (!(option_mask32 & OPT_B)) /* not overridden */
lines_before = Copt;
//option_mask32 |= OPT_A|OPT_B; /* for parser */
}
/* sanity checks */
if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) {
option_mask32 &= ~OPT_n;
lines_before = 0;
lines_after = 0;
} else if (lines_before > 0)
before_buf = xzalloc(lines_before * sizeof(char *));
} else if (lines_before > 0) {
before_buf = xzalloc(lines_before * sizeof(before_buf[0]));
USE_EXTRA_COMPAT(before_buf_size = xzalloc(lines_before * sizeof(before_buf_size[0]));)
}
#else
/* with auto sanity checks */
/* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */
@ -537,7 +610,7 @@ int grep_main(int argc, char **argv)
if (gl->flg_mem_alocated_compiled & ALLOCATED)
free(gl->pattern);
if (gl->flg_mem_alocated_compiled & COMPILED)
regfree(&(gl->preg));
regfree(&gl->compiled_regex);
free(gl);
free(pattern_head_ptr);
}

View File

@ -9,6 +9,10 @@
* Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
*/
/* for getline() [GNUism] */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include "libbb.h"
/* This function reads an entire line from a text file, up to a newline
@ -55,7 +59,6 @@ char* FAST_FUNC xmalloc_fgets(FILE *file)
return bb_get_chunk_from_file(file, &i);
}
/* Get line. Remove trailing \n */
char* FAST_FUNC xmalloc_fgetline(FILE *file)
{
@ -68,6 +71,44 @@ char* FAST_FUNC xmalloc_fgetline(FILE *file)
return c;
}
#if 0
/* GNUism getline() should be faster (not tested) than a loop with fgetc */
/* Read one line via getline(), keeping the trailing \n if there is one.
 * Returns the malloced line, or NULL once getline() reports -1. */
char* FAST_FUNC xmalloc_fgets(FILE *file)
{
	size_t capacity;
	char *buf = NULL;

	if (getline(&buf, &capacity, file) != -1) {
		//TODO: trimming to res_sz?
		return buf;
	}
	/* uclibc allocates a buffer even on EOF, so it has to be released */
	free(buf);
	return NULL;
}
/* Read one line via getline() and strip the trailing \n, if present.
 * Returns the malloced line, or NULL once getline() reports -1. */
char* FAST_FUNC xmalloc_fgetline(FILE *file)
{
	char *buf = NULL;
	size_t capacity;
	ssize_t len;

	len = getline(&buf, &capacity, file);
	if (len == -1) {
		/* uclibc allocates a buffer even on EOF, so it has to be released */
		free(buf);
		return NULL;
	}
	if (buf[len - 1] == '\n')
		buf[len - 1] = '\0';
	//TODO: trimming to res_sz?
	return buf;
}
#endif
#if 0
/* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07.
*

View File

@ -27,6 +27,6 @@ void FAST_FUNC xregcomp(regex_t *preg, const char *regex, int cflags)
{
char *errmsg = regcomp_or_errmsg(preg, regex, cflags);
if (errmsg) {
bb_error_msg_and_die("xregcomp: %s", errmsg);
bb_error_msg_and_die("bad regex '%s': %s", regex, errmsg);
}
}

View File

@ -62,12 +62,8 @@ testing "grep -s nofile - (stdin and nonexisting file, match)" \
"grep -s domatch nonexistent - ; echo \$?" \
"(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n"
# This doesn't match GNU behaviour (Binary file input matches)
# acts like GNU grep -a
testing "grep handles binary files" "grep foo input" "foo\n" "\0foo\n\n" ""
# This doesn't match GNU behaviour (Binary file (standard input) matches)
# acts like GNU grep -a
testing "grep handles binary stdin" "grep foo" "foo\n" "" "\0foo\n\n"
testing "grep handles NUL in files" "grep -a foo input" "\0foo\n" "\0foo\n\n" ""
testing "grep handles NUL on stdin" "grep -a foo" "\0foo\n" "" "\0foo\n\n"
testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \
"0\n" "\0\n" ""