Commit 356f2d67 authored by Jens Getreu

count output-line length in characters (not bytes)

parent 9c62c2e7
......@@ -194,13 +194,13 @@ as *GNU strings* replacement.
**-q** *NUM*, **\--output-line-len**=*NUM*
: Set output-line-length in UTF-8 bytes. Length of the printed output line
in UTF-8 bytes (string-findings only, metadata excluded). The line-length
is limited by some internal buffer size value (see "`OUTPUT_BUF_LEN`" in
source code). A value "`NUM`" bigger than "`OUTPUT_BUF_LEN`" is set to
"`OUT_PUT_LEN`". The longer the line-length is, the fewer strings will be
wrapped to the next line. The downside with long output lines is, that
the scanner loses precision in locating the findings.
: Set the printed output-line-length in UTF-8 characters (string-findings
only, metadata excluded). The line-length is limited by some internal
buffer size value (see "`OUTPUT_BUF_LEN`" in source code). A value
"`NUM`" bigger than "`OUTPUT_BUF_LEN/2`" is set to "`OUTPUT_BUF_LEN/2`".
The longer the line-length is, the fewer strings will be wrapped to the
next line. The downside of long output lines is that the scanner loses
precision in locating the findings.
**-s** *NUM*, **\--counter-offset**=*NUM*
......
......@@ -142,13 +142,13 @@ impl<'a> Deref for FindingCollection<'a> {
/// not determine its exact position.
pub enum Precision {
/// The finding is located somewhere before `Finding::position`. It is
/// guaranteed, that the finding is not farther than `--output-line-len -1`
/// guaranteed, that the finding is not farther than 2*`--output-line-len`
/// bytes (or the previous finding from the same scanner) away.
Before,
/// The algorithm could determine the exact position of the `Finding` at
/// `Finding::position`.
Exact,
/// The finding is located some `[1..output_line_len]` bytes after
/// The finding is located some `[1..2* --output-line-len]` bytes after
/// `Finding::position` or - in any case - always before the next
/// `Finding::position`.
After,
......
......@@ -60,10 +60,11 @@ pub struct SplitStr<'a> {
inp: &'a str,
/// Initially points to the first byte of the `inp`-buffer. In case `ok_s` is
/// very long and has `>=ok_s_len_max`, the iterator stops and sends out
/// `ok_s`. Then `inp_start_p` is moved to the first byte after `ok_s` so that
/// the next `next()` deals with the rest of the string. This way the second
/// half will be identified to be the continuation of the first part.
/// very long and has `>=ok_char_nb_max` characters, the iterator stops and
/// sends out `ok_s`. Then `inp_start_p` is moved to the first byte after
/// `ok_s` so that the next `next()` deals with the rest of the string. This
/// way the second half will be identified to be the continuation of the
/// first part.
inp_start_p: *const u8,
/// Points to the first byte after the end of `inp` buffer.
......@@ -75,7 +76,7 @@ pub struct SplitStr<'a> {
/// Criteria that influences the search performed by `next()`. Normally only
/// substrings larger than `>=chars_min_nb` will be returned by `next()`.
/// This rule concerning only substrings touching one o fthe `inp` buffer
/// This rule concerning only substrings touching one of the `inp` buffer
/// boundaries has 2 exceptions:
///
/// 1. When `last_s_was_maybe_cut` is set and
......@@ -109,12 +110,8 @@ pub struct SplitStr<'a> {
utf8f: Utf8Filter,
/// This imposes an additional constraint on the iterator and instructs it
/// to never return substrings longer than `s_len_max`. Usually this is equal
/// the `inp`-buffer's length, but there can be exceptions of longer
/// `inp`-buffers. For example when the previous run has left some
/// non-treated `left_over` bytes which are then prepended to the
/// `inp`-buffer. In the worst case, such an `inp` is then twice as large.
s_len_max: usize,
/// to never return substrings longer than `s_char_nb_max`.
s_char_nb_max: usize,
}
/// This enum describes result variants of the `SplitStr::next()` output.
......@@ -169,7 +166,7 @@ impl<'a> SplitStr<'a> {
last_s_was_maybe_cut: bool,
invalid_bytes_after_inp: bool,
utf8f: Utf8Filter,
s_len_max: usize,
s_char_nb_max: usize,
) -> SplitStr {
unsafe {
SplitStr {
......@@ -187,7 +184,7 @@ impl<'a> SplitStr<'a> {
// We will set this to false later, if `utf8f.grep_char` requires some
// additional checking.
utf8f,
s_len_max,
s_char_nb_max,
}
}
}
......@@ -209,24 +206,21 @@ impl<'a> Iterator for SplitStr<'a> {
let mut ok_s_len = 0usize;
let mut ok_char_nb = 0usize;
// The longest `ok_s` we want to return in one `next()` iteration is
// of length `ok_s_len_max`, which the usual `inp`-buffer size
// when no extra bytes are prepended.
// of length `ok_char_nb_max`.
// When we return such a maximum length string, we
// keep the rest in `inp` for `next()`. Such a long string can only
// appear, when some bytes form the last run had been prepended to
// 'inp'.
let ok_s_len_max = self.s_len_max;
// keep the rest in `inp` for `next()`.
let ok_char_nb_max = self.s_char_nb_max;
// The following loop has 4 exits:
// 1. We finished the whole buffer: `self.p >= self.inp_end_p`
// 2. A long string was found: `ok_s_len >= ok_s_len_max`,
// 2. A long string was found: `ok_char_nb >= ok_char_nb_max`,
// `p` points to the first of the remaining bytes, left
// for the next `next()` run.
// 3. We found a substring at the beginning of the buffer;
// 4. We found a substring somewhere in the middle of the buffer;
// Exit 1. and 2.
while self.p < self.inp_end_p && ok_s_len < ok_s_len_max {
while self.p < self.inp_end_p && ok_char_nb < ok_char_nb_max {
// We do not need an additional boundary check, because we
// know from above that there is at least one character in
// `inp` and there are only valid UTF-8 in here.
......@@ -314,18 +308,14 @@ impl<'a> Iterator for SplitStr<'a> {
// Exit 2 or 3:
let s_touches_right_boundary = unsafe { ok_s_p.add(ok_s_len) } >= self.inp_end_p;
let s_is_maybe_cut =
ok_s_len >= ok_s_len_max || (s_touches_right_boundary && !self.invalid_bytes_after_inp);
let s_is_maybe_cut = ok_char_nb >= ok_char_nb_max
|| (s_touches_right_boundary && !self.invalid_bytes_after_inp);
let s_completes_previous_s = s_touches_left_boundary && self.last_s_was_maybe_cut;
// With this flag we tell the caller, that he should not immediately
// print the returned string, but rather insert it at the beginning
// of the next input buffer and decode and run `SplitStr` again.
//
// Note, we require, that `ok_s_len` is at least 1 byte SMALLER then
// `self.s_len_max` (`ok_s_len < self.s_len_max`). This way
// we print strings that fill the whole output line directly.
//
// Note, `&& !s_completes_previous_s` guarantees, that
// `s_is_to_be_filtered_again` is only set out for the first part
// of a longer cut string. We only want the first part of string to be
......@@ -341,10 +331,10 @@ impl<'a> Iterator for SplitStr<'a> {
// 2. When the first part (==`!not_completes_previous`) of a longer
// string who touches the right buffer boundary
// (`==s_touches_right_boundary`) did start somewhere in the middle of
// the buffer (==`ok_s_len < self.s_len_max`). We actually could
// the buffer (==`ok_char_nb < self.s_char_nb_max`). We actually could
// print it out now, because it has the minimum length, but we want to
// print the beginning of every string as long as possible (approx
// `output_line_length`). Instead, we rather set
// `output_line_char_nb_max`). Instead, we rather set
// `s_is_to_be_filtered_again` instructing the caller to insert
// this string at the beginning of the next buffer. Doing so, we
// guarantee, that string beginnings are always assembled, even if they
......@@ -352,12 +342,12 @@ impl<'a> Iterator for SplitStr<'a> {
// `stringsext` through additional filters, e.g. searching for
// particular patterns.
//
// As `ok_char_nb < chars_min_nb` is part of `ok_s_len < self.s_len_max`
// As `ok_char_nb < chars_min_nb` is part of `ok_char_nb < self.s_char_nb_max`
// we do not need to add this condition explicitly below.
let s_is_to_be_filtered_again = !s_completes_previous_s
&& s_touches_right_boundary
&& !self.invalid_bytes_after_inp
&& (ok_s_len < self.s_len_max || !grep_char_ok);
&& (ok_char_nb < self.s_char_nb_max || !grep_char_ok);
let s_satisfies_min_char_rule = ok_char_nb >= self.chars_min_nb as usize;
let s_satisfies_grep_char_rule = grep_char_ok;
......@@ -383,7 +373,7 @@ impl<'a> Iterator for SplitStr<'a> {
};
// Exit was 2: prepare the inner state for the next `next()` run.
if ok_s_len >= ok_s_len_max {
if ok_char_nb >= ok_char_nb_max {
self.inp_start_p = self.p;
};
self.last_s_was_maybe_cut = s_is_maybe_cut;
......
......@@ -18,7 +18,8 @@
//!
//! 3. Each thread runs a search in `main::slice` == `scanner::input_buffer`. The
//! search is performed by `scanner::scan()`, which cuts the `scanner::input_buffer`
//! into smaller chunks of size `output_line_len` hereafter called `input_window`.
//! into smaller chunks of size 2*`output_line_char_nb_max` bytes hereafter called
//! `input_window`.
//!
//! 4. The `Decoder` runs through the `input_window`, searches for valid strings and
//! decodes them into UTF-8-chunks.
......@@ -27,7 +28,7 @@
//! analyzed if parts of it satisfy certain filter conditions.
//!
//! 6. Doing so, the `helper::SplitStr` cuts the UTF-8-chunk into even smaller
//! `SplitStr`-chunks not longer than `output_line_len` and sends them back to the
//! `SplitStr`-chunks not longer than `output_line_char_nb_max` and sends them back to the
//! `scanner::scan()` loop.
//!
//! 7. There the `SplitStr`-chunk is packed into a `finding::Finding` object and
......@@ -105,7 +106,8 @@ fn run() -> Result<(), anyhow::Error> {
Some(ref fname) => {
let f = File::create(&Path::new(fname.as_str()))?;
// There is at least one `Mission` in `MISSIONS`.
let output_line_len = MISSIONS[0].output_line_len + OUTPUT_LINE_METADATA_LEN;
let output_line_len =
2 * MISSIONS[0].output_line_char_nb_max + OUTPUT_LINE_METADATA_LEN;
let f = LineWriter::with_capacity(output_line_len, f);
Box::new(f) as Box<dyn Write>
}
......
......@@ -9,8 +9,8 @@ use crate::counter_offset_default;
use crate::encoding_default;
use crate::input::ByteCounter;
use crate::options::ARGS;
use crate::options::OUTPUT_LINE_LEN_MIN;
use crate::output_line_len_default;
use crate::options::OUTPUT_LINE_CHAR_NB_MIN;
use crate::output_line_char_nb_max_default;
use anyhow::{anyhow, Context, Result};
use encoding_rs::*;
use lazy_static::lazy_static;
......@@ -421,10 +421,10 @@ pub struct Mission {
/// A filter, defining additional criteria for a finding to be printed.
pub filter: Utf8Filter,
/// Maximum length of output-lines in UTF-8 bytes. Findings that do not fit,
/// will be wrapped to two or more lines. The label `+` indicates that this
/// line is the continuation of the previous line.
pub output_line_len: usize,
/// Maximum length of output-lines in UTF-8 characters. Findings that do not
/// fit, will be wrapped to two or more lines. The label `+` indicates that
/// this line is the continuation of the previous line.
pub output_line_char_nb_max: usize,
/// The `encoding_rs` decoder has no direct support for ASCII. As a
/// workaround, we simulate the missing ASCII-decoder with the
......@@ -570,11 +570,11 @@ impl Missions {
let flag_output_line_len =
parse_integer!(flag_output_line_len, usize::from_str_radix, usize::from_str);
if let Some(m) = flag_output_line_len {
if m < OUTPUT_LINE_LEN_MIN {
if m < OUTPUT_LINE_CHAR_NB_MIN {
return Err(anyhow!(
"minimum for `--output-line-len` is `{}`, \
you tried: `{}`.",
OUTPUT_LINE_LEN_MIN,
OUTPUT_LINE_CHAR_NB_MIN,
m
));
}
......@@ -615,19 +615,19 @@ impl Missions {
},
};
let output_line_len = match flag_output_line_len {
let output_line_char_nb_max = match flag_output_line_len {
Some(n) => n,
None => output_line_len_default!(),
None => output_line_char_nb_max_default!(),
};
if output_line_len < OUTPUT_LINE_LEN_MIN {
if output_line_char_nb_max < OUTPUT_LINE_CHAR_NB_MIN {
return Err(anyhow!(
"Scanner {}: \
minimum for `--output-line-len` is `{}`, \
you tried: `{}`.",
char::from((mission_id + 97) as u8),
OUTPUT_LINE_LEN_MIN,
output_line_len,
OUTPUT_LINE_CHAR_NB_MIN,
output_line_char_nb_max,
));
}
......@@ -703,7 +703,7 @@ impl Missions {
encoding,
chars_min_nb,
filter,
output_line_len,
output_line_char_nb_max,
mission_id: mission_id as u8,
print_encoding_as_ascii,
});
......
......@@ -46,7 +46,7 @@ macro_rules! counter_offset_default {
/// Default value when no `--output-line-len`
/// command-line-argument is given. Must be `usize`.
#[macro_export]
macro_rules! output_line_len_default {
macro_rules! output_line_char_nb_max_default {
() => {
60usize
};
......@@ -54,7 +54,7 @@ macro_rules! output_line_len_default {
/// There must be space for at least 3 long Unicode characters,
/// to guarantee progress in streaming. You want much longer lines.
pub const OUTPUT_LINE_LEN_MIN: usize = 12;
pub const OUTPUT_LINE_CHAR_NB_MIN: usize = 6;
/// Message printed for command-line `--help`.
const USAGE: &str = concat!(
......@@ -84,8 +84,8 @@ Options:
chars_min_default!(),
").
-p FILE, --output=FILE Print not to stdout but in file.
-q NUM, --output-line-len=NUM Output line length in UTF-8 bytes (default: ",
output_line_len_default!(),
-q NUM, --output-line-len=NUM Output line length in UTF-8 characters (default: ",
output_line_char_nb_max_default!(),
").
-s NUM, --counter-offset=NUM Start counting input bytes with NUM (default: ",
counter_offset_default!(),
......
......@@ -91,7 +91,7 @@ impl<'a> ScannerState {
// We multiply `mission.chars_min_nb` by 4, because it is
// counted Unicode-codepoints and a codepoint can have
// maximum 4 bytes in UTF-8.
last_scan_run_leftover: String::with_capacity(mission.output_line_len as usize),
last_scan_run_leftover: String::with_capacity(mission.output_line_char_nb_max as usize),
last_run_str_was_printed_and_is_maybe_cut_str: false,
consumed_bytes: mission.counter_offset,
}
......@@ -153,8 +153,8 @@ pub fn scan<'a>(
ss.last_run_str_was_printed_and_is_maybe_cut_str;
// In many encodings (e.g. UTF16), to fill one `output_line` we need more bytes of input.
// When the string gets longer than `output_line_len`, `SplitStr` will wrap the line.
let decoder_input_window = 2 * ss.mission.output_line_len;
// If ever the string gets longer than `output_line_char_nb_max`, `SplitStr` will wrap the line.
let decoder_input_window = 2 * ss.mission.output_line_char_nb_max;
let mut is_last_window = false;
// iterate over `input_buffer` with `decoder_input_window`-sized slices.
......@@ -270,7 +270,7 @@ pub fn scan<'a>(
continue_str_if_possible,
invalid_bytes_after_split_str_buffer,
ss.mission.filter,
ss.mission.output_line_len,
ss.mission.output_line_char_nb_max,
) {
if !chunk.s_is_to_be_filtered_again {
// We keep it for printing.
......@@ -377,7 +377,7 @@ mod tests {
chars_min_nb: 3,
// this is a pass all filter
filter: UTF8_FILTER_ALL_VALID,
output_line_len: 10,
output_line_char_nb_max: 10,
};
}
lazy_static! {
......@@ -389,7 +389,7 @@ mod tests {
chars_min_nb: 3,
// this is a pass all filter
filter: UTF8_FILTER_LATIN,
output_line_len: 10,
output_line_char_nb_max: 10,
};
}
......@@ -406,7 +406,7 @@ mod tests {
ubf: UBF_LATIN,
grep_char: Some(42),
},
output_line_len: 10,
output_line_char_nb_max: 10,
};
}
......@@ -418,7 +418,7 @@ mod tests {
encoding: &Encoding::for_label(("x-user-defined").as_bytes()).unwrap(),
chars_min_nb: 3,
filter: UTF8_FILTER_ALL_VALID,
output_line_len: 10,
output_line_char_nb_max: 10,
};
}
lazy_static! {
......@@ -434,7 +434,7 @@ mod tests {
ubf: UBF_NONE,
grep_char: None,
},
output_line_len: 10,
output_line_char_nb_max: 10,
};
}
lazy_static! {
......@@ -446,7 +446,7 @@ mod tests {
chars_min_nb: 4,
// this is a pass all filter
filter: UTF8_FILTER_LATIN,
output_line_len: 60,
output_line_char_nb_max: 60,
};
}
#[test]
......@@ -659,12 +659,12 @@ mod tests {
assert_eq!(fc.v[0].position, 10015);
assert_eq!(fc.v[0].position_precision, Precision::Before);
assert_eq!(fc.v[0].s, "no no€St");
assert_eq!(fc.v[0].s, "no no€Stre");
// Here the line is full.
assert_eq!(fc.v[1].position, 10015);
assert_eq!(fc.v[1].position_precision, Precision::After);
assert_eq!(fc.v[1].s, "ream end.");
assert_eq!(fc.v[1].s, "am end.");
assert_eq!(fc.first_byte_position, 10015);
assert_eq!(fc.str_buf_overflow, false);
......@@ -804,7 +804,7 @@ mod tests {
assert_eq!(fc.first_byte_position, 10_000);
assert_eq!(fc.str_buf_overflow, false);
assert_eq!(fc.v.len(), 3);
assert_eq!(fc.v.len(), 2);
assert_eq!(fc.v[0].position, 10_000);
assert_eq!(fc.v[0].position_precision, Precision::Exact);
......@@ -813,11 +813,7 @@ mod tests {
assert_eq!(fc.v[1].position, 10_000);
assert_eq!(fc.v[1].position_precision, Precision::After);
assert_eq!(fc.v[1].s, "\u{f782}h\u{f783}ijk");
assert_eq!(fc.v[2].position, 10_000);
assert_eq!(fc.v[2].position_precision, Precision::After);
assert_eq!(fc.v[2].s, "\u{f789}\u{f790}");
assert_eq!(fc.v[1].s, "\u{f782}h\u{f783}ijk\u{f789}\u{f790}");
assert_eq!(
// We only compare the first 35 bytes, the others are 0 anyway.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment