Initial optimized implementation

This commit is contained in:
Avery Winters 2024-03-15 12:43:32 -05:00
parent 9add76b415
commit 9233682c5a
Signed by: avery
SSH key fingerprint: SHA256:eesvLB5MMqHLZrAMFt6kEhqJWnASMLcET6Sgmw0FqZI
8 changed files with 316 additions and 10 deletions

90
Cargo.lock generated
View file

@ -2,6 +2,24 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "allocator-api2"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "anstream"
version = "0.6.13"
@ -59,6 +77,15 @@ checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1"
[[package]]
name = "bilrow-calculate"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"fast-float",
"hashbrown",
"indicatif",
"memchr",
"memmap2",
]
[[package]]
name = "bilrow-generate"
@ -141,12 +168,28 @@ version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]]
name = "fast-float"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c"
[[package]]
name = "fastrand"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "hashbrown"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "heck"
version = "0.4.1"
@ -187,12 +230,33 @@ version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "memchr"
version = "2.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149"
[[package]]
name = "memmap2"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
dependencies = [
"libc",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "portable-atomic"
version = "1.6.0"
@ -252,6 +316,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "windows-sys"
version = "0.52.0"
@ -317,3 +387,23 @@ name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]]
name = "zerocopy"
version = "0.7.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

View file

@ -1,3 +1,6 @@
[workspace]
resolver = "2"
members = ["calculate", "generate"]
[profile.release]
debug = true

View file

@ -6,3 +6,10 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.5.2", features = ["derive"] }
anyhow = "1.0.80"
indicatif = "0.17.8"
memchr = "2.7.1"
memmap2 = "0.9.4"
hashbrown = "0.14.3"
fast-float = "0.2.0"

View file

@ -1,3 +1,34 @@
fn main() {
println!("Hello, world!");
use ::{
anyhow::Result,
clap::{Parser, ValueEnum},
std::path::PathBuf,
};
mod optimized;
mod reference;
fn main() -> Result<()> {
let args = Args::parse();
match args.mode {
Mode::Reference => reference::run(args),
Mode::Optimized => optimized::run(args),
}
}
#[derive(Clone, ValueEnum)]
enum Mode {
Reference,
Optimized,
}
/// Program to generate input files for the one billion row challenge.
#[derive(Parser)]
#[command(version, about)]
struct Args {
#[arg(value_enum, short, long, default_value_t = Mode::Reference)]
mode: Mode,
/// The file path to read the input from
#[arg()]
input: PathBuf,
}

View file

@ -0,0 +1,97 @@
use {
crate::Args,
anyhow::{Context, Result},
fast_float::parse,
hashbrown::HashMap,
indicatif::{ProgressBar, ProgressStyle},
memchr::{memchr, memchr_iter},
memmap2::{Advice, MmapOptions},
std::{
fs::File,
io::{BufWriter, Write},
},
};
struct Data {
min: f64,
max: f64,
sum: f64,
count: u64,
}
const BUFFER_SIZE: usize = 1 << 30;
const UNIQUE_CITY_COUNT: usize = 10_000;
pub(crate) fn run(args: Args) -> Result<()> {
let input = File::open(args.input)?;
let mut mmap_options = MmapOptions::new();
mmap_options.populate();
let input = unsafe { mmap_options.map(&input)? };
input.advise(Advice::Sequential)?;
input.advise(Advice::WillNeed)?;
let input = &input[..];
let style: ProgressStyle = ProgressStyle::with_template(
"[{elapsed}/{duration}] [{bar}] {percent}% ({binary_bytes_per_sec})",
)
.expect("bad progress bar style");
let pb = ProgressBar::new(input.len().try_into()?).with_style(style);
let mut data: HashMap<&[u8], Data> = HashMap::with_capacity(UNIQUE_CITY_COUNT);
let mut start = 0;
let mut last_pb_update: usize = 0;
for pos in memchr_iter(b'\n', input) {
let line = &input[start..pos];
start = pos + 1;
if line.first() == Some(&b'#') {
continue;
}
let split = memchr(b';', line).context("bad input")?;
let city = &line[..split];
let temperature = &line[split + 1..];
let temperature = parse(temperature)?;
let entry = data.entry(city);
entry
.and_modify(|data| {
data.min = data.min.min(temperature);
data.max = data.max.max(temperature);
data.sum += temperature;
data.count += 1;
})
.or_insert_with(|| Data {
min: temperature,
max: temperature,
sum: temperature,
count: 1,
});
if pos - last_pb_update >= 10_000_000 {
pb.set_position(pos.try_into()?);
last_pb_update = pos;
}
}
pb.finish();
let mut data: Vec<_> = data.iter().collect();
data.sort_unstable_by_key(|(&key, _)| key);
let output = std::io::stdout();
let output = output.lock();
let mut output = BufWriter::with_capacity(BUFFER_SIZE, output);
for (
city,
Data {
min,
max,
sum,
count,
},
) in data
{
writeln!(
output,
"{};{min:.1};{:.1};{max:.1}",
std::str::from_utf8(city)?,
sum / *count as f64
)?;
}
Ok(())
}

View file

@ -0,0 +1,81 @@
use {
crate::Args,
::{
anyhow::{Context, Result},
indicatif::{ProgressBar, ProgressStyle},
std::{
collections::BTreeMap,
fs::File,
io::{BufRead, BufReader, BufWriter, Write},
},
},
};
const BUFFER_SIZE: usize = 1 << 30;
pub(crate) fn run(args: Args) -> Result<()> {
let input = File::open(args.input)?;
let len = input.metadata()?.len();
let input = BufReader::with_capacity(BUFFER_SIZE, input);
let style: ProgressStyle = ProgressStyle::with_template(
"[{elapsed}/{duration}] [{bar}] {percent}% ({binary_bytes_per_sec})",
)
.expect("bad progress bar style");
let pb = ProgressBar::new(len).with_style(style);
let input = pb.wrap_read(input);
struct StationData {
min: f64,
max: f64,
sum: f64,
count: u64,
}
let mut stations: BTreeMap<Box<str>, StationData> = BTreeMap::new();
for (i, line) in input.lines().enumerate() {
let line = line?;
if line.starts_with('#') {
continue;
}
let (city, temperature) = line
.split_once(';')
.with_context(|| format!("line {i} malformed"))?;
let temperature = temperature.parse::<f64>()?;
if let Some(data) = stations.get_mut(city) {
data.min = data.min.min(temperature);
data.max = data.max.max(temperature);
data.sum += temperature;
data.count += 1;
} else {
stations.insert(
city.into(),
StationData {
min: temperature,
max: temperature,
sum: temperature,
count: 1,
},
);
}
}
pb.finish();
let output = std::io::stdout();
let output = output.lock();
let mut output = BufWriter::with_capacity(BUFFER_SIZE, output);
for (
city,
StationData {
min,
max,
sum,
count,
},
) in stations
{
writeln!(output, "{city};{min:.1};{:.1};{max:.1}", sum / count as f64)?;
}
Ok(())
}

View file

@ -32,6 +32,7 @@
commands = [
{package = pkgs.clangStdenv;}
{package = pkgs.cargo;}
{package = pkgs.cargo-flamegraph;}
{package = pkgs.nil;}
{package = pkgs.rustc;}
{package = pkgs.rustfmt;}

View file

@ -18,7 +18,7 @@ mod cities;
fn main() -> Result<()> {
let args = Args::parse();
let output = File::create(&args.output_path)?;
let output = File::create(&args.output)?;
let mut output = BufWriter::with_capacity(args.buffer, output);
let mut rng = args
.seed
@ -28,7 +28,7 @@ fn main() -> Result<()> {
writeln!(
output,
"# Generated with `bilrow-generate --cities {} --entries {} --buffer {} --seed {} {:?}`",
args.cities, args.entries, args.buffer, seed, args.output_path
args.cities, args.entries, args.buffer, seed, args.output
)?;
// Fork the RNG so we can change the order we use them in later if we want without
// affecting reproducibility.
@ -39,15 +39,11 @@ fn main() -> Result<()> {
ProgressStyle::with_template("[{elapsed}/{duration}] [{bar}] {percent}% ({per_sec})")
.expect("bad progress bar style");
let pb = ProgressBar::new(args.entries.get()).with_style(style);
const PB_UPDATE_INTERVAL: u64 = 1 << 20;
for i in 0..args.entries.get() {
for _ in pb.wrap_iter(0..args.entries.get()) {
let city = cities_rng.choice(&chosen_cities).expect("no chosen cities");
// Map 0..1 to -99.9..99.9
let temperature = 99.9 * (2.0 * temperature_rng.f64() - 1.0);
writeln!(output, "{city};{temperature:.1}")?;
if i % PB_UPDATE_INTERVAL == 0 {
pb.inc(PB_UPDATE_INTERVAL);
}
}
pb.finish();
Ok(())
@ -74,5 +70,5 @@ struct Args {
/// The file path to write the output to
#[arg()]
output_path: PathBuf,
output: PathBuf,
}