diff --git a/calculate/src/main.rs b/calculate/src/main.rs index bd2b185..409d87b 100644 --- a/calculate/src/main.rs +++ b/calculate/src/main.rs @@ -1,3 +1,5 @@ +#![feature(portable_simd)] + use ::{ anyhow::Result, clap::{Parser, ValueEnum}, diff --git a/calculate/src/optimized.rs b/calculate/src/optimized.rs index 79ab840..b5bbfc2 100644 --- a/calculate/src/optimized.rs +++ b/calculate/src/optimized.rs @@ -10,6 +10,7 @@ use { std::{ fs::File, io::{BufWriter, Write}, + simd::prelude::*, }, }; @@ -33,7 +34,12 @@ pub(crate) fn run(args: Args) -> Result<()> { input.advise(Advice::Sequential)?; input.advise(Advice::WillNeed)?; - let input = &input[..]; + let mut input = &input[..]; + while input.first() == Some(&b'#') { + let pos_nl = memchr(b'\n', input).context("bad input")?; + input = &input[pos_nl + 1..]; + } + let style: ProgressStyle = ProgressStyle::with_template( "[{elapsed}/{duration}] [{bar}] {percent}% ({binary_bytes_per_sec})", ) @@ -134,6 +140,9 @@ fn parse_temp(input: &[u8], sep_pos: usize, nl_pos: usize) -> i64 { sign * (a * 100 + b * 10 + c) } +const LANES: usize = 32; +type Chunk = Simd; + fn find_sep(b: &[u8]) -> Option { memchr(b';', b) } @@ -150,7 +159,82 @@ fn chunk<'input, 'bump>( let mut data: HashMap<&'bump [u8], Data, RandomState> = HashMap::with_capacity_and_hasher(UNIQUE_CITY_COUNT, RandomState::default()); + let sep = Chunk::splat(b';'); + let nl = Chunk::splat(b'\n'); let mut pb_since_last_inc: usize = 0; + 'outer: while input.len() > LANES { + if pb_since_last_inc >= 10_000_000 { + pb.inc(pb_since_last_inc as u64); + pb_since_last_inc = 0; + } + let chunk = Chunk::from_slice(input); + let mut mask_sep = sep.simd_eq(chunk).to_bitmask(); + let mut mask_nl = nl.simd_eq(chunk).to_bitmask(); + + let mut i_sep = 0; + if mask_sep == 0 { + loop { + i_sep += 1; + let chunk_input = &input[i_sep * LANES..]; + if chunk_input.len() < LANES { + break 'outer; + } + let chunk = Chunk::from_slice(chunk_input); + mask_sep = sep.simd_eq(chunk).to_bitmask(); + mask_nl = nl.simd_eq(chunk).to_bitmask(); + if mask_sep != 0 { + break; + } + } + } + + let mut i_nl = i_sep; + if mask_nl == 0 { + loop { + i_nl += 1; + let chunk_input = &input[i_nl * LANES..]; + if chunk_input.len() < LANES { + break 'outer; + } + let chunk = Chunk::from_slice(chunk_input); + mask_nl = nl.simd_eq(chunk).to_bitmask(); + if mask_nl != 0 { + break; + } + } + } + + let offset_sep = mask_sep.trailing_zeros(); + let offset_nl = mask_nl.trailing_zeros(); + let pos_sep = LANES * i_sep + offset_sep as usize; + let pos_nl = LANES * i_nl + offset_nl as usize; + + let city = unsafe { input.get_unchecked(..pos_sep) }; + let temperature = parse_temp(input, pos_sep, pos_nl); + + input = &input[pos_nl + 1..]; + pb_since_last_inc += pos_nl + 1; + + let (_key, data) = data.raw_entry_mut().from_key(city).or_insert_with(|| { + ( + city_bumper.alloc_slice_copy(city), + Data { + min: i64::MAX, + max: i64::MIN, + sum: 0, + count: 0, + }, + ) + }); + if temperature < data.min { + data.min = temperature; + } + if temperature > data.max { + data.max = temperature; + } + data.sum += temperature; + data.count += 1; + } loop { if pb_since_last_inc >= 10_000_000 { pb.inc(pb_since_last_inc as u64); @@ -169,6 +253,7 @@ fn chunk<'input, 'bump>( } let city = unsafe { input.get_unchecked(..pos_sep) }; let temperature = parse_temp(input, pos_sep, pos_nl); + input = &input[pos_nl + 1..]; pb_since_last_inc += pos_nl + 1;