Commit 4ff03dde authored by Jens Getreu

replace codebase with version 2 branch

parent 4a61c180
#!/bin/sh
cd doc
./make-doc
cargo rustdoc -p stringsext -- --document-private-items
[package]
name = "stringsext"
version = "1.7.1"
version = "1.99.0"
authors = ["Jens Getreu <[email protected]>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
#encoding = { git = "https://github.com/lifthrasiir/rust-encoding" }
# use nightly itertools with bugfix: Merge pull request #135 (2 Aug 2016)
# minimum version is: 0.4.17!
#itertools = { git = "https://github.com/bluss/rust-itertools" }
docopt = "*"
encoding = "*"
itertools = "*"
encoding_rs= "*"
lazy_static = "*"
memmap = "*"
rand = "*"
docopt = "*"
serde = "*"
serde_derive = "*"
itertools = "*"
scoped_threadpool = "*"
tempdir = "*"
anyhow = "*"
@@ -5,10 +5,10 @@ title: stringsext - search for multi-byte encoded strings in binary data
**stringsext** is a Unicode enhancement of the *GNU strings* tool with
additional functionalities: **stringsext** recognizes Cyrillic, CJKV
characters and other scripts in all supported multi-byte-encodings,
while *GNU strings* fails in finding any of these scripts in UTF-16 and
many other encodings.
additional functionalities: **stringsext** recognizes Cyrillic, Arabic, CJKV
characters and other scripts in all supported multi-byte-encodings, while
*GNU strings* fails in finding any of these scripts in UTF-16 and many other
encodings.
**stringsext** prints all graphic character sequences in *FILE* or
*stdin* that are at least *MIN* bytes long.
@@ -22,14 +22,14 @@ encoding is specified, the scan is performed in different threads
simultaneously.
When searching for UTF-16 encoded strings, 96% of all possible two byte
sequences, interpreted as UTF-16 code unit, relate directly to a Unicode
code point. As a result, the probability of encountering valid Unicode
sequences, interpreted as UTF-16 code unit, relate directly to Unicode
codepoints. As a result, the probability of encountering valid Unicode
characters in a random byte stream, interpreted as UTF-16, is also 96%.
In order to reduce this big number of false positives, **stringsext**
provides a parameterizable Unicode-block-filter. See **\--encodings**
provides a parametrizable Unicode-block-filter. See **\--encodings**
option in the manual page for more details.
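The 96% figure follows from a simple count: of the 65,536 possible 16-bit values, only the 2,048 surrogate code units (U+D800 to U+DFFF) do not correspond to a Unicode scalar value. A standalone Rust sketch (illustrative only, not part of the crate) confirms this:

```rust
fn main() {
    // Count the 16-bit values that are valid Unicode scalar values when read
    // as a single UTF-16 code unit; only the 2048 surrogates are rejected.
    let valid = (0u32..=0xFFFF)
        .filter(|&c| char::from_u32(c).is_some())
        .count();
    println!("{} of 65536 = {:.1}%", valid, 100.0 * valid as f64 / 65536.0);
    // Prints: 63488 of 65536 = 96.9%
}
```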
**stringsext** is mainly useful for determining the Unicode content of
**stringsext** is mainly useful for extracting Unicode content out of
non-text files.
When invoked with `stringsext -e ascii -c i` **stringsext** can be used
---
title: 'Todo'
subtitle: ''
author: Jens Getreu
date: 2018-09-24
revision: 1.0
fileext: md
---
* Optimize code by reducing copying: use `Cow` where possible.
* Migrate to [encoding_rs - Rust](https://docs.rs/encoding_rs/0.8.0/encoding_rs/)
Concerned functions are:
* file: `scanner.rs`: `scan_window()`
* file: `finding.rs`: `writer_hint()`, `write_char()`, `write_str()`
* Performance in `finding.rs`: `macro_rules! enc_str`:
  avoid `format!`, use something like [numtoa - Cargo: packages for
  Rust](https://crates.io/crates/numtoa/0.0.7) instead (see the sketch below).
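A rough sketch of what such allocation-free number formatting could look like is given below. The helper `itoa_into` is hypothetical and only illustrates the idea behind crates like `numtoa`, namely writing digits into a caller-provided buffer instead of allocating a new `String` with `format!`:

```rust
/// Hypothetical helper: write the decimal digits of `n` into a reusable
/// buffer and return them as `&str`, avoiding the allocation done by `format!`.
fn itoa_into(mut n: u64, buf: &mut [u8; 20]) -> &str {
    let mut i = buf.len();
    loop {
        i -= 1;
        buf[i] = b'0' + (n % 10) as u8;
        n /= 10;
        if n == 0 {
            break;
        }
    }
    // The slice contains only ASCII digits, so UTF-8 validation cannot fail.
    std::str::from_utf8(&buf[i..]).unwrap()
}

fn main() {
    let mut buf = [0u8; 20];
    assert_eq!(itoa_into(1500, &mut buf), "1500");
    assert_eq!(itoa_into(0, &mut buf), "0");
}
```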
#!/bin/sh
cd ..
mkdir --parents ./doc/build/html
mkdir --parents ./doc/build/html/_downloads
mkdir --parents ./doc/build/man/man1
mkdir --parents ./build/html
mkdir --parents ./build/html/_downloads
mkdir --parents ./build/man/man1
# Man page
pandoc -f markdown -t html --include-in-header=./doc/source/docutils_basic.css\
./doc/source/stringsext--man.md -o ./doc/build/html/stringsext--man.html
pandoc -s ./doc/source/stringsext--man.md -t man \
-o ./doc/build/man/man1/stringsext.1
gzip -f ./doc/build/man/man1/stringsext.1
pandoc -f markdown -t html --include-in-header=./source/docutils_basic.css\
./source/stringsext--man.md -o ./build/html/stringsext--man.html
pandoc -s ./source/stringsext--man.md -t man \
-o ./build/man/man1/stringsext.1
gzip -f ./build/man/man1/stringsext.1
# Readme page
pandoc -f markdown -t html --include-in-header=./doc/source/docutils_basic.css\
./doc/source/README.md -o ./doc/build/html/index.html
pandoc -f markdown -t html --include-in-header=./source/docutils_basic.css\
./source/README.md -o ./build/html/index.html
cd ./doc/build/html/_downloads
cd ./build/html/_downloads
ln -sf ../../../../target/* .
ln -sf ../../../../doc/build/man/man1/stringsext.1.gz .
ln -sf ../../../build/man/man1/stringsext.1.gz .
cargo rustdoc -p stringsext -- --document-private-items
//! Custom 7-bit ASCII graphic only encoding.
use encoding::types::{ByteWriter, CodecError, Encoding, RawDecoder, RawEncoder, StringWriter};
use std::convert::Into;
/// A static castable reference to `AsciiGraphicEncoding`.
/// Usage: `let enc = ASCII_GRAPHIC as encoding::EncodingRef`.
pub const ASCII_GRAPHIC: &self::AsciiGraphicEncoding = &self::AsciiGraphicEncoding;
/// This custom encoding is derived from encoding::ASCIIEncoding.
/// The only difference is that it represents only graphic characters. All control characters
/// except tab and space are regarded as invalid.
#[derive(Clone, Copy)]
pub struct AsciiGraphicEncoding;
impl Encoding for AsciiGraphicEncoding {
fn name(&self) -> &'static str {
"ascii"
}
fn whatwg_name(&self) -> Option<&'static str> {
None
}
fn raw_encoder(&self) -> Box<dyn RawEncoder> {
AsciiGraphicEncoder::new()
}
fn raw_decoder(&self) -> Box<dyn RawDecoder> {
AsciiGraphicDecoder::new()
}
}
/// An encoder for graphic-only ASCII.
#[derive(Clone, Copy)]
pub struct AsciiGraphicEncoder;
impl AsciiGraphicEncoder {
#[allow(clippy::new_ret_no_self)]
pub fn new() -> Box<dyn RawEncoder> {
Box::new(AsciiGraphicEncoder)
}
}
impl RawEncoder for AsciiGraphicEncoder {
fn from_self(&self) -> Box<dyn RawEncoder> {
AsciiGraphicEncoder::new()
}
fn is_ascii_compatible(&self) -> bool {
true
}
fn raw_feed(
&mut self,
input: &str,
output: &mut dyn ByteWriter,
) -> (usize, Option<CodecError>) {
output.writer_hint(input.len());
// Everything non-graphic is unrepresentable: bytes >= 0x7F (DEL and
// non-ASCII) and control characters below 0x20, except tab (0x09).
match input
.as_bytes()
.iter()
.position(|&ch| ch >= 0x7F || (ch < 0x20) && (ch != 0x09))
{
Some(first_error) => {
output.write_bytes(&input.as_bytes()[..first_error]);
let len = input[first_error..].chars().next().unwrap().len_utf8();
(
first_error,
Some(CodecError {
upto: (first_error + len) as isize,
cause: "non-graphic character".into(),
}),
)
}
None => {
output.write_bytes(input.as_bytes());
(input.len(), None)
}
}
}
fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
None
}
}
/// A decoder for graphic-only ASCII.
#[derive(Clone, Copy)]
pub struct AsciiGraphicDecoder;
impl AsciiGraphicDecoder {
#[allow(clippy::new_ret_no_self)]
pub fn new() -> Box<dyn RawDecoder> {
Box::new(AsciiGraphicDecoder)
}
}
impl RawDecoder for AsciiGraphicDecoder {
fn from_self(&self) -> Box<dyn RawDecoder> {
AsciiGraphicDecoder::new()
}
fn is_ascii_compatible(&self) -> bool {
true
}
fn raw_feed(
&mut self,
input: &[u8],
output: &mut dyn StringWriter,
) -> (usize, Option<CodecError>) {
output.writer_hint(input.len());
fn write_ascii_bytes(output: &mut dyn StringWriter, buf: &[u8]) {
output.write_str(std::str::from_utf8(buf).unwrap());
}
// Everything non-graphic is a decoding error: bytes >= 0x7F (DEL and
// non-ASCII) and control characters below 0x20, except tab (0x09).
match input
.iter()
.position(|&ch| ch >= 0x7F || (ch < 0x20) && (ch != 0x09))
{
Some(first_error) => {
write_ascii_bytes(output, &input[..first_error]);
(
first_error,
Some(CodecError {
upto: first_error as isize + 1,
cause: "non graphic character".into(),
}),
)
}
None => {
write_ascii_bytes(output, input);
(input.len(), None)
}
}
}
fn raw_finish(&mut self, _output: &mut dyn StringWriter) -> Option<CodecError> {
None
}
}
#[cfg(test)]
mod tests {
use super::ASCII_GRAPHIC;
use encoding::EncodingRef;
#[test]
fn test_decoder() {
let enc = ASCII_GRAPHIC as EncodingRef;
let mut decoder = enc.raw_decoder();
let mut ret = String::new();
let input = "abc\u{3}\u{3}\u{3}\u{0}def\nghijk".as_bytes();
let (offset, err) = decoder.raw_feed(&input[..], &mut ret);
assert_eq!(ret, "abc");
assert_eq!(offset, 3);
assert_eq!(err.unwrap().upto, 4);
}
}
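// For comparison with `test_decoder` above, a minimal sketch of how the
// encoder side could be exercised. It assumes that the `encoding` crate
// implements `ByteWriter` for `Vec<u8>` (as its built-in encoders rely on);
// the test is illustrative and not part of the original suite.
#[cfg(test)]
mod encoder_tests {
    use super::ASCII_GRAPHIC;
    use encoding::EncodingRef;
    #[test]
    fn test_encoder() {
        let enc = ASCII_GRAPHIC as EncodingRef;
        let mut encoder = enc.raw_encoder();
        // Assumption: `Vec<u8>` implements `encoding::types::ByteWriter`.
        let mut ret: Vec<u8> = Vec::new();
        let (offset, err) = encoder.raw_feed("abc\u{3}def", &mut ret);
        // The encoder emits everything up to the first non-graphic byte.
        assert_eq!(ret, b"abc".to_vec());
        assert_eq!(offset, 3);
        assert_eq!(err.unwrap().upto, 4);
    }
}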
//! Help the user with command-line-arguments.
use crate::ascii_enc_label;
use crate::mission::ASCII_FILTER_ALIASSE;
use crate::mission::UNICODE_BLOCK_FILTER_ALIASSE;
use crate::mission::{Missions, MISSIONS};
use crate::options::ARGS;
use crate::AUTHOR;
use crate::VERSION;
use std::process;
use std::str;
/// Function called at the beginning of `stringsext`. When help is printed to the
/// user, the program exits.
pub fn help() {
if ARGS.flag_version {
println!("Version {}, {}", VERSION.unwrap_or("unknown"), AUTHOR);
process::exit(0);
};
if ARGS.flag_debug_options {
println!("GIVEN COMMANDLINE-ARGUMENTS\n");
println!("Input files\n-----------");
for (n, name) in ARGS.arg_FILE.iter().enumerate() {
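// Label the input files 'A', 'B', 'C', ... (65 is the ASCII code of 'A').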
println!("{} = {}", char::from((n + 65) as u8), name);
}
println!("\nEncoding and filter definitions\n-------------------------------");
for (n, name) in ARGS.flag_encoding.iter().enumerate() {
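// Label the encoding definitions 'a', 'b', 'c', ... (97 is the ASCII code of 'a').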
println!("{} = {}", char::from((n + 97) as u8), name);
}
println!("\n\nPARSED COMMANDLINE-ARGUMENTS\n");
let ms: &'static Missions = &MISSIONS;
for (i, m) in ms.v.iter().enumerate() {
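// Each scanner is listed under the letter of its encoding definition above.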
println!(
"Scanner ({})\n-----------\n{:#?}\n",
char::from((i + 97) as u8),
m
);
}
process::exit(0);
};
if ARGS.flag_list_encodings {
// Is there a way to programmatically query a list from `Encoding`?
// This list is taken from the `Encoding` source file (2019-12-11)
// and may not be up to date.
println!("LIST OF AVAILABLE ENCODINGS AND PREDEFINED FILTERS\n");
println!("Format: --encoding=[ENC_NAME],[MIN],[AF,UBF],[GREP]\n\n");
println!("ENC_NAME (Encoding)=");
let list: [&'static str; 41] = [
ascii_enc_label!(),
"Big5",
"EUC-JP",
"EUC-KR",
"GBK",
"IBM866",
"ISO-2022-JP",
"ISO-8859-10",
"ISO-8859-13",
"ISO-8859-14",
"ISO-8859-15",
"ISO-8859-16",
"ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-8-I",
"KOI8-R",
"KOI8-U",
"Shift_JIS",
"UTF-16BE",
"UTF-16LE",
"UTF-8",
"gb18030",
"macintosh",
"replacement",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"windows-874",
"x-mac-cyrillic",
"x-user-defined",
];
// Available encodings
for e in list.iter() {
println!("\t{}", e);
}
println!("\tWarning: this list may be outdated.");
println!(
"\tPlease consult the library `encoding_rs` documentation \
for more available encodings.\n\n"
);
println!("MIN = <number>");
println!("\tOnly strings with at least <number> characters are printed.\n\n");
println!("AF (ASCII-Filter) = <filter name> or <hexadecimal number>");
for (e, b, c) in &ASCII_FILTER_ALIASSE {
let b = format!("{:#x}", b);
println!(
"\t{} = {:>35} ({})",
str::from_utf8(e).unwrap(),
b,
str::from_utf8(c).unwrap().trim()
);
}
println!(
"\tUse predefined filter names above or your own filter starting with `0x...`.\n\n"
);
println!("UBF (Unicode-Block-Filter) = <filter name> or <hexadecimal number>");
for (e, b, c) in &UNICODE_BLOCK_FILTER_ALIASSE {
let b = format!("{:#x}", b);
println!(
"\t{} = {:>18} ({})",
str::from_utf8(e).unwrap(),
b,
str::from_utf8(c).unwrap().trim()
);
}
println!(
"\tUse predefined filter names above or your own filter starting with `0x...`.\n\n"
);
println!("GREP = <ASCII code>");
println!("\tPrint only lines having at least one character with <ASCII-code>.");
println!("\tUseful values are `47` (/) or `92` (\\) for path search.");
println!("\t<ASCII code> can be decimal or hexadecimal and must be < 128.");
process::exit(0);
}
}
//! This module deals with commandline arguments and related data
//! This module deals with command-line arguments and directly related data
//! structures.
use docopt::Docopt;
use lazy_static::lazy_static;
use serde_derive::Deserialize;
#[cfg(test)]
pub const FLAG_BYTES_MAX: usize = 0xff; // max of Args.flag_bytes
/// Encoding-name literal used when simulating the non-built-in
/// ASCII decoder.
#[macro_export]
macro_rules! ascii_enc_label {
() => {
"ascii"
};
}
/// Default encoding, used when no `--encoding` is given on the command
/// line or when its ENCNAME field is left empty.
/// Must be one of `--list-encodings`.
#[macro_export]
macro_rules! encoding_default {
() => {
//ascii_enc_label!()
"UTF-8"
};
}
/// Default value, when no `--chars-min` command-line-argument
/// is given. Must be `u8`.
#[macro_export]
macro_rules! chars_min_default {
() => {
4u8
};
}
/// Default value, when no `--counter-offset` command-line-argument
/// is given. Must be of type `ByteCounter`.
#[macro_export]
macro_rules! counter_offset_default {
() => {
0
};
}
/// Default value when no `--output-line-len`
/// command-line-argument is given. Must be `usize`.
#[macro_export]
macro_rules! output_line_len_default {
() => {
60usize
};
}
/// There must be space for at least 3 long Unicode characters (up to 4
/// UTF-8 bytes each, hence the minimum of 12) to guarantee progress in
/// streaming. In practice you want much longer lines.
pub const OUTPUT_LINE_LEN_MIN: usize = 12;
/// Help message and string for `Docopt` used to populate the `Args` structure.
const USAGE: &str = "
/// Message printed for command-line `--help`.
const USAGE: &str = concat!(
"
Usage: stringsext [options] [-e ENC...] [--] [FILE...]
stringsext [options] [-e ENC...] [--] [-]
Options:
-c MODE, --control-chars=MODE `p` prints ctrl-chars, `r` replaces with '�'. [default: i]
-e ENC, --encoding=ENC Set (multiple) input search encodings. [default: ascii]
ENC==ENCNAME[,MIN[,UNICODEBLOCK]]
ENCNAME: one of `--list-encodings`.
-a AF --ascii-filter=AF ASCII-filter AF applied after decoding. See
`--list-encodings` for AF examples.
-c, --no-metadata Never print byte-counter, encoding or filter.
-d, --debug-options Show how command-line-options are interpreted.
-e ENC, --encoding=ENC Set (multiple) input search encodings (default: ",
encoding_default!(),
").
ENC==[ENCNAME],[MIN],[AF],[UBF],[GREP-CHAR]
ENCNAME: `ascii`, `utf-8`, `big5`, ...
MIN: overwrites general `--bytes MIN` for this ENC only.
UNICODEBLOCK: search only for characters in range
(defaults to all: U+0..U+10FFFF).
-f, --print-file-name Print the name of the file before each string.
AF (ASCII-FILTER): `all-ctrl`, `0xffff...`, ...
UBF (UNICODE-BLOCK-FILTER): `latin`, `cyrillic`, ...
GREP-CHAR: grep for GREP-CHAR ASCII-code.
See `--list-encodings` for more detail.
-g ASCII, --grep-char=ASCII Grep for characters with ASCII-code in output lines.
-h, --help Display this message.
-l, --list-encodings List available encoding-names for ENCNAME.
-n MIN, --bytes=MIN Minimum length of printed strings. [default: 4]
-l, --list-encodings List predefined encoding and filter names for ENC.
-n NUM, --chars-min=NUM Minimum characters of printed strings (default: ",
chars_min_default!(),
").
-p FILE, --output=FILE Print not to stdout but in file.
-s MIN, --split-bytes=MIN Minimum length of printed split strings. [default: 1]
-t RADIX, --radix=RADIX Enable Byte counter with radix `o`, `x` or `d`.
-V, --version Print version info and exit.
";
-q NUM, --output-line-len=NUM Output line length in UTF-8 bytes (default: ",
output_line_len_default!(),
").
-s NUM, --counter-offset=NUM Start counting input bytes with NUM (default: ",
counter_offset_default!(),
").
-t RADIX, --radix=RADIX Enable byte-counter with radix `o`, `x` or `d`.
-u UBF, --unicode-block-filter=UBF
Unicode-block-filter UBF applied after decoding.
See `--list-encodings` for UBF examples.
-V, --version Print version and exit.
"
);
/// This structure holds the command-line-options and is populated by `docopt`.
/// See man-page and the output of `--list-encodings` and `--help` for more
/// information about their meaning.
#[allow(non_snake_case)]
#[derive(Debug, Deserialize)]
pub struct Args {
/// Pathname of the input data file. `None` defaults to `stdin`.
pub flag_ascii_filter: Option<String>,
pub flag_no_metadata: bool,
pub flag_debug_options: bool,
pub arg_FILE: Vec<String>,
/// Do not filter (valid) control chars.
pub flag_control_chars: ControlChars,
/// A vector holding encodings to scan for.
pub flag_encoding: Vec<String>,
/// Show control characters as '�' (U+FFFD).
pub flag_grep_char: Option<String>,
pub flag_list_encodings: bool,
/// Print version and exit.
pub flag_version: bool,
/// Required minimum length of printed strings in UTF8-Bytes.
pub flag_bytes: Option<u8>,
/// Required minimum length of a split strings to be printed.
pub flag_split_bytes: Option<u8>,
/// The radix of the Byte counter when printed.
pub flag_radix: Option<Radix>,
/// Pathname of the output file. `None` defaults to `stdout`.
pub flag_chars_min: Option<String>,
pub flag_output: Option<String>,
/// Print the name of the file before each string.
pub flag_print_file_name: bool,
}
/// Mode determining how control characters are printed.
#[derive(PartialEq, Debug, Deserialize)]
pub enum ControlChars {
/// print all valid characters, without filtering
P,
/// group and replace control characters with '�' (U+FFFD)
R,
/// silently ignore all control characters
I,
pub flag_output_line_len: Option<String>,
pub flag_counter_offset: Option<String>,
pub flag_radix: Option<Radix>,
pub flag_unicode_block_filter: Option<String>,
pub flag_version: bool,
}
/// Radix of the `byte-counter` when printed.
@@ -79,8 +133,7 @@ pub enum Radix {
}
lazy_static! {
/// Static `Args` structure.
// TODO? compose custom error type to improve error messages
/// Static `Args` struct.
pub static ref ARGS : Args = Docopt::new(USAGE)
.and_then(|d| d.deserialize())
.unwrap_or_else(|e| e.exit());
@@ -93,19 +146,19 @@ mod tests {
/// Are the command-line options read and processed correctly?
#[test]
fn test_arg_parser() {
use super::{Args, ControlChars, Radix, USAGE};
use super::{Args, Radix, USAGE};
use docopt::Docopt;
// The argv. Normally you'd just use `parse` which will automatically
// use `std::env::args()`.
let argv = || {
vec![
"stringsext",
"-c",
"r",
"-d",
"-n",
"10",
"-s",
"11",
"-g",
"64",
"-e",
"ascii",
"-e",
@@ -114,6 +167,10 @@ mod tests {
"-l",
"-p",
"outfile",
"-q",
"40",
"-s",
"1500",
"-t",
"o",
"infile1",
@@ -123,21 +180,22 @@
let args: Args = Docopt::new(USAGE)
.and_then(|d| d.argv(argv().into_iter()).deserialize())
.unwrap_or_else(|e| e.exit());
//println!("{:?}",args);
fn s(x: &str) -> String {
x.to_string()
}
assert_eq!(args.arg_FILE[0], "infile1".to_string());
assert_eq!(args.arg_FILE[1], "infile2".to_string());
assert_eq!(args.flag_control_chars, ControlChars::R);
assert_eq!(args.flag_debug_options, true);
assert_eq!(args.flag_encoding, vec![s("ascii"), s("utf-8")]);
assert_eq!(args.flag_version, true);
assert_eq!(args.flag_list_encodings, true);
assert_eq!(args.flag_bytes, Some(10u8));
assert_eq!(args.flag_split_bytes, Some(11u8));
assert_eq!(args.flag_chars_min, Some("10".to_string()));
assert_eq!(args.flag_grep_char, Some("64".to_string()));
assert_eq!(args.flag_radix, Some(Radix::O));
assert_eq!(args.flag_counter_offset, Some("1500".to_string()));
assert_eq!(args.flag_output, Some(s("outfile")));
assert_eq!(args.flag_print_file_name, false);
assert_eq!(args.flag_output_line_len, Some("40".to_string()));
assert_eq!(args.flag_no_metadata, false);
}
}