package io.markovic.jmh.experiments;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.RandomStringUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

/**
 * JMH benchmark that walks an {@link ArrayList} of strings in two ways -- an indexed {@code for}
 * loop and an enhanced {@code for} loop (which goes through the list's {@code Iterator}) -- so
 * that their heap allocation can be compared when run with {@code -prof gc}.
 *
 * <p>Configuration, as declared by the annotations below: average-time mode reported in
 * nanoseconds, 5 warmup and 5 measurement iterations, 2 forks, and a single state instance shared
 * per benchmark ({@link Scope#Benchmark}).
 *
 * <p>The timing numbers are a by-product; see the commentary after the recorded results at the
 * bottom of the class for why they should not be used to compare loop performance.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5)
@Measurement(iterations = 5)
@State(Scope.Benchmark)
@Fork(2)
public class IteratorGC {
  /** Number of strings placed in {@link #strings}; JMH runs every benchmark once per value. */
  @Param({"4", "10", "20", "50", "1000"})
  public int numStrings;

  /** The list both benchmarks iterate over; (re)built by {@link #setup()}. */
  List<String> strings;

  /**
   * Fills {@link #strings} with {@link #numStrings} strings, each produced by
   * {@code RandomStringUtils.random(10)}.
   *
   * <p>No {@code Level} is given on {@code @Setup}, so JMH's default setup level applies.
   */
  @Setup
  public void setup() {
    // Presize the backing array so populating the list never has to grow it.
    strings = new ArrayList<>(numStrings);
    for (int i = 0; i < numStrings; i++) {
      strings.add(RandomStringUtils.random(10));
    }
  }

  /**
   * Iterates {@link #strings} by index using {@code size()} and {@code get(i)}, so no
   * {@code Iterator} object is involved.
   *
   * @param blackhole JMH sink that receives every element read from the list
   */
  @Benchmark
  public void rawForLoop(Blackhole blackhole) {
    // The loop shape is the thing under test: size() is deliberately re-read on every pass.
    for (int i = 0; i < strings.size(); i++) {
      // Hand each element to the Blackhole so the read counts as used.
      blackhole.consume(strings.get(i));
    }
  }

  /**
   * Iterates {@link #strings} with an enhanced {@code for} loop, which obtains an
   * {@code Iterator} from the list; this is the variant whose allocation behaviour is in
   * question.
   *
   * @param blackhole JMH sink that receives every element read from the list
   */
  @Benchmark
  public void forEachLoop(Blackhole blackhole) {
    for (String s : strings) {
      // Hand each element to the Blackhole so the read counts as used.
      blackhole.consume(s);
    }
  }

  // RESULTS! (When run with `-prof gc`)
  //
  // NOTE(review): the parameter column below is headed (numItems) while the field in this class
  // is `numStrings` -- presumably the parameter was renamed after this run was recorded; confirm
  // or re-run. Cnt = 10 matches 2 forks x 5 measurement iterations.
  //
  // Benchmark                                   (numItems)  Mode  Cnt     Score     Error   Units
  // IteratorGC.forEachLoop                                 4  avgt   10    19.030 ±   0.386   ns/op
  // IteratorGC.forEachLoop:·gc.alloc.rate                  4  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.forEachLoop:·gc.alloc.rate.norm             4  avgt   10    ≈ 10⁻⁵              B/op
  // IteratorGC.forEachLoop:·gc.count                       4  avgt   10       ≈ 0            counts
  // IteratorGC.forEachLoop                                10  avgt   10    50.100 ±   1.497   ns/op
  // IteratorGC.forEachLoop:·gc.alloc.rate                 10  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.forEachLoop:·gc.alloc.rate.norm            10  avgt   10    ≈ 10⁻⁵              B/op
  // IteratorGC.forEachLoop:·gc.count                      10  avgt   10       ≈ 0            counts
  // IteratorGC.forEachLoop                                20  avgt   10    97.925 ±   2.449   ns/op
  // IteratorGC.forEachLoop:·gc.alloc.rate                 20  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.forEachLoop:·gc.alloc.rate.norm            20  avgt   10    ≈ 10⁻⁴              B/op
  // IteratorGC.forEachLoop:·gc.count                      20  avgt   10       ≈ 0            counts
  // IteratorGC.forEachLoop                                50  avgt   10   253.550 ±  13.140   ns/op
  // IteratorGC.forEachLoop:·gc.alloc.rate                 50  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.forEachLoop:·gc.alloc.rate.norm            50  avgt   10    ≈ 10⁻⁴              B/op
  // IteratorGC.forEachLoop:·gc.count                      50  avgt   10       ≈ 0            counts
  // IteratorGC.forEachLoop                              1000  avgt   10  5065.450 ± 149.429   ns/op
  // IteratorGC.forEachLoop:·gc.alloc.rate               1000  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.forEachLoop:·gc.alloc.rate.norm          1000  avgt   10     0.002 ±   0.001    B/op
  // IteratorGC.forEachLoop:·gc.count                    1000  avgt   10       ≈ 0            counts
  // IteratorGC.rawForLoop                                  4  avgt   10    17.050 ±   0.407   ns/op
  // IteratorGC.rawForLoop:·gc.alloc.rate                   4  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.rawForLoop:·gc.alloc.rate.norm              4  avgt   10    ≈ 10⁻⁵              B/op
  // IteratorGC.rawForLoop:·gc.count                        4  avgt   10       ≈ 0            counts
  // IteratorGC.rawForLoop                                 10  avgt   10    41.231 ±   1.307   ns/op
  // IteratorGC.rawForLoop:·gc.alloc.rate                  10  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.rawForLoop:·gc.alloc.rate.norm             10  avgt   10    ≈ 10⁻⁵              B/op
  // IteratorGC.rawForLoop:·gc.count                       10  avgt   10       ≈ 0            counts
  // IteratorGC.rawForLoop                                 20  avgt   10    82.340 ±   3.602   ns/op
  // IteratorGC.rawForLoop:·gc.alloc.rate                  20  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.rawForLoop:·gc.alloc.rate.norm             20  avgt   10    ≈ 10⁻⁴              B/op
  // IteratorGC.rawForLoop:·gc.count                       20  avgt   10       ≈ 0            counts
  // IteratorGC.rawForLoop                                 50  avgt   10   212.377 ±   4.300   ns/op
  // IteratorGC.rawForLoop:·gc.alloc.rate                  50  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.rawForLoop:·gc.alloc.rate.norm             50  avgt   10    ≈ 10⁻⁴              B/op
  // IteratorGC.rawForLoop:·gc.count                       50  avgt   10       ≈ 0            counts
  // IteratorGC.rawForLoop                               1000  avgt   10  3960.977 ±  44.419   ns/op
  // IteratorGC.rawForLoop:·gc.alloc.rate                1000  avgt   10    ≈ 10⁻⁴            MB/sec
  // IteratorGC.rawForLoop:·gc.alloc.rate.norm           1000  avgt   10     0.002 ±   0.001    B/op
  // IteratorGC.rawForLoop:·gc.count                     1000  avgt   10       ≈ 0            counts
  //
  // The interesting line to look at for each benchmark is gc.alloc.rate.norm,
  // which shows the heap allocation per benchmark operation (B/op).
  // NONE OF THE BENCHMARK RUNS ALLOCATE! Effectively none whatsoever: every
  // value above is at most a few thousandths of a byte per operation. The
  // iterator _always_ gets escape-analyzed away by the JVM; that's very
  // unsurprising given that removing iterator GC overhead was one of the
  // primary reasons why escape analysis was added to the JVM!
  // DON'T BE FOOLED BY THE TIMING RESULTS! Using blackhole.consume prevents
  // clever loop optimizations the JVM can perform, like iteration fusing,
  // loop unrolling etc.
  // We use this benchmark strictly to test GC overhead. To see the perf
  // overhead of iterators, head over to IteratorPerf.java.
}