-
Notifications
You must be signed in to change notification settings - Fork 22
Fix Hodges-Lehmann distribution ratio calculation #111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,110 @@ | ||
| package com.atlassian.performance.tools.report.api.distribution | ||
|
|
||
| import com.numericalmethod.suanshu.stats.test.rank.wilcoxon.WilcoxonRankSum | ||
| import org.apache.commons.math3.stat.descriptive.rank.Median | ||
| import org.apache.commons.math3.stat.ranking.NaNStrategy | ||
|
|
||
| class DistributionComparator private constructor( | ||
| private val baseline: DoubleArray, | ||
| private val experiment: DoubleArray, | ||
| /** | ||
| * A ratio by which experiment can be slower/faster than baseline and not considered as a regression/improvement | ||
| */ | ||
| private val tolerance: Double, | ||
| private val significance: Double | ||
| ) { | ||
|
|
||
|
|
||
| /** | ||
| * Performs a one-tailed Mann–Whitney U test to check whether experiment is slower than the baseline | ||
| * | ||
| * @return true if the experiment is slower than the baseline by more than tolerance, false otherwise | ||
| */ | ||
| private fun isExperimentRegressed(baselineMedian: Double): Boolean { | ||
| val mu = -tolerance * baselineMedian | ||
| return WilcoxonRankSum(baseline, experiment, mu).pValue1SidedLess < significance | ||
| } | ||
|
|
||
| private fun isExperimentImproved(baselineMedian: Double): Boolean { | ||
| val mu = -tolerance * baselineMedian | ||
| return WilcoxonRankSum(experiment, baseline, mu).pValue1SidedLess < significance | ||
| } | ||
|
|
||
| /** | ||
| * Pseudo-median: the median of the Walsh (pairwise) averages | ||
| */ | ||
| private fun pseudoMedian(array: DoubleArray): Double { | ||
| val n = array.size | ||
| val size = n * (n + 1) / 2 - n | ||
| val values = DoubleArray(size) | ||
| var k = 0 | ||
| for (i in 0 until n) { | ||
| for (j in i + 1 until n) { | ||
| values[k++] = (array[i] + array[j]) / 2 | ||
| } | ||
| } | ||
| return Median().evaluate(values) | ||
| } | ||
|
|
||
| private fun median(func: (xi: Double, yj: Double) -> Double): Double { | ||
| val values = DoubleArray(baseline.size * experiment.size) | ||
| var k = 0 | ||
| for (i in baseline.indices) { | ||
| for (j in experiment.indices) { | ||
| values[k++] = func(baseline[i], experiment[j]) | ||
| } | ||
| } | ||
| return Median().withNaNStrategy(NaNStrategy.MINIMAL).evaluate(values) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When used on a set of latency measurements, why would we inject fake negative infinities?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but the workaround injects fake extreme values. Both "before" and "after" seem wrong.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only value of this PR is to have a |
||
| } | ||
|
|
||
| private fun shift(): Double { | ||
| return median { xi, yj -> yj - xi } | ||
| } | ||
|
|
||
| private fun ratio(): Double { | ||
| return median { xi, yj -> yj / xi } | ||
| } | ||
|
|
||
| /** | ||
| * Calculates the distance between two data sets based on the [Hodges-Lehmann estimator][]. | ||
| * [Hodges-Lehmann estimator]: https://en.wikipedia.org/wiki/Hodges%E2%80%93Lehmann_estimator | ||
| * https://aakinshin.net/hodges-lehmann-estimator/ | ||
| * https://github.com/AndreyAkinshin/perfolizer/blob/master/src/Perfolizer/Perfolizer/Mathematics/GenericEstimators/HodgesLehmannEstimator.cs | ||
| * | ||
| * Takes into account tolerance which answers the question "is change is big enough to matter?" | ||
| */ | ||
| fun compare(): DistributionComparison { | ||
| val experimentShift = shift() | ||
| val baselineMedian = pseudoMedian(baseline) | ||
| val experimentRatio = ratio() | ||
| val isExperimentImproved = isExperimentImproved(baselineMedian) | ||
| val isExperimentRegressed = isExperimentRegressed(baselineMedian) | ||
| return DistributionComparison( | ||
| experimentRelativeChange = experimentRatio - 1, | ||
| experimentShift = experimentShift, | ||
| isExperimentRegressed = isExperimentRegressed, | ||
| isExperimentImproved = isExperimentImproved | ||
| ) | ||
| } | ||
|
|
||
| class Builder( | ||
| private var baseline: DoubleArray, | ||
| private var experiment: DoubleArray | ||
| ) { | ||
| private var significance: Double = 0.05 | ||
| private var tolerance: Double = 0.01 | ||
|
|
||
| fun significance(significance: Double) = apply { this.significance = significance } | ||
| fun tolerance(tolerance: Double) = apply { this.tolerance = tolerance } | ||
| fun baseline(baseline: DoubleArray) = apply { this.baseline = baseline } | ||
| fun experiment(experiment: DoubleArray) = apply { this.experiment = experiment } | ||
|
|
||
| fun build() = DistributionComparator( | ||
| baseline = baseline, | ||
| experiment = experiment, | ||
| tolerance = tolerance, | ||
| significance = significance | ||
| ) | ||
|
|
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| package com.atlassian.performance.tools.report.api.distribution | ||
|
|
||
| class DistributionComparison( | ||
| val experimentRelativeChange: Double, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ratio or percentage? |
||
| val experimentShift: Double, | ||
| val isExperimentRegressed: Boolean, | ||
| val isExperimentImproved: Boolean | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
| ) { | ||
|
|
||
| init { | ||
| if (isExperimentImproved && isExperimentRegressed) { | ||
| throw IllegalArgumentException("Experiment can't be both regressed and improved at the same time") | ||
| } | ||
| } | ||
|
|
||
| fun hasImpact() = isExperimentRegressed || isExperimentImproved | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,7 +2,7 @@ package com.atlassian.performance.tools.report.api.judge | |
|
|
||
| import com.atlassian.performance.tools.jiraactions.api.ActionType | ||
| import com.atlassian.performance.tools.report.ActionMetricsReader | ||
| import com.atlassian.performance.tools.report.api.ShiftedDistributionRegressionTest | ||
| import com.atlassian.performance.tools.report.api.distribution.DistributionComparator | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No difference detected by
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No diff here
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we don't see why the new one is better.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How do you define better? Classification has not changed for unit tested cases and as stated in first PR comment, it's expected |
||
| import com.atlassian.performance.tools.report.api.junit.FailedAssertionJUnitReport | ||
| import com.atlassian.performance.tools.report.api.junit.JUnitReport | ||
| import com.atlassian.performance.tools.report.api.junit.SuccessfulJUnitReport | ||
|
|
@@ -70,14 +70,16 @@ class RelativeNonparametricPerformanceJudge private constructor( | |
| report = FailedAssertionJUnitReport(reportName, "No action $label results for $experimentCohort"), | ||
| action = action | ||
| ) | ||
| val test = ShiftedDistributionRegressionTest(baseline, experiment, mwAlpha = significance, ksAlpha = 0.0) | ||
| // shifts are negated, because ShiftedDistributionRegressionTest is relative to experiment, instead of baseline | ||
mgrzaslewicz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| val comparison = DistributionComparator.Builder(baseline, experiment) | ||
| .tolerance(toleranceRatio.toDouble()) | ||
| .build() | ||
| .compare() | ||
| val impact = LatencyImpact.Builder( | ||
| action, | ||
| -test.percentageShift, | ||
| reader.convertToDuration(-test.locationShift) | ||
| comparison.experimentRelativeChange, | ||
| reader.convertToDuration(comparison.experimentShift) | ||
| ) | ||
| .relevant(test.overcomesTolerance(toleranceRatio.toDouble())) | ||
| .relevant(comparison.hasImpact()) | ||
| .build() | ||
| impactHandlers.forEach { it.accept(impact) } | ||
| return if (impact.regression) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,5 @@ | ||
| package com.atlassian.performance.tools.report | ||
|
|
||
| import com.atlassian.performance.tools.jiraactions.api.* | ||
| import com.atlassian.performance.tools.report.api.ShiftedDistributionRegressionTest | ||
| import com.atlassian.performance.tools.report.api.result.FakeResults | ||
| import com.atlassian.performance.tools.report.chart.Chart | ||
|
|
@@ -12,7 +11,7 @@ import org.apache.commons.math3.distribution.NormalDistribution | |
| import org.apache.commons.math3.distribution.NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY | ||
| import org.apache.commons.math3.random.MersenneTwister | ||
| import org.assertj.core.api.Assertions.assertThat | ||
| import org.assertj.core.api.SoftAssertions.* | ||
| import org.assertj.core.api.SoftAssertions.assertSoftly | ||
| import org.assertj.core.data.Offset | ||
| import org.junit.Ignore | ||
| import org.junit.Test | ||
|
|
@@ -112,7 +111,7 @@ class ShiftedDistributionRegressionTestTest { | |
| } | ||
|
|
||
| /** | ||
| * In a 51% vs 49% case, small diffs should not dominate the big diffs. | ||
| * In a 51% slightly faster vs 49% much slower case, it should be a regression | ||
| */ | ||
| @Ignore("https://ecosystem.atlassian.net/browse/JPERF-1297") | ||
| @Test | ||
|
|
@@ -152,16 +151,39 @@ class ShiftedDistributionRegressionTestTest { | |
| } | ||
| } | ||
|
|
||
| @Test | ||
| @Ignore("percentageShift calculation is fixed in in DistributionComparator where Hodges-Lehmann estimator with pseudo-median is used") | ||
| fun shouldDetectImprovementWhenEveryPercentileBetter() { | ||
| // given | ||
| val baseline = | ||
| this.javaClass.getResource("/real-results/view issue 9.17.0 vs 10.0.0/baseline.csv").readText().lines() | ||
| .map { it.toDouble() }.toDoubleArray() | ||
| val experiment = | ||
| this.javaClass.getResource("/real-results/view issue 9.17.0 vs 10.0.0/experiment.csv").readText().lines() | ||
| .map { it.toDouble() }.toDoubleArray() | ||
| // when | ||
| val test = ShiftedDistributionRegressionTest(baseline, experiment) | ||
| // then | ||
| plotQuantiles(baseline, experiment) | ||
| assertSoftly { | ||
| it.assertThat(test.isExperimentRegressed(0.01)).`as`("isExperimentRegressed").isFalse() | ||
| it.assertThat(test.percentageShift).`as`("").`as`("percentageShift").isEqualTo(0.03941908713692943) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Double |
||
| it.assertThat(test.locationShift).`as`("").`as`("locationShift").isEqualTo(20.0) | ||
| it.assertThat(test.overcomesTolerance(0.01)).`as`("overcomesTolerance").isTrue() | ||
| } | ||
| } | ||
|
|
||
|
|
||
| private fun plotQuantiles( | ||
| baseline: DoubleArray, | ||
| experiment: DoubleArray | ||
| ) { | ||
| val chart = Chart(listOf( | ||
| chartLine(baseline, "baseline"), | ||
| chartLine(experiment, "experiment") | ||
| )) | ||
| val chart = Chart( | ||
| listOf( | ||
| chartLine(baseline, "baseline"), | ||
| chartLine(experiment, "experiment") | ||
| ) | ||
| ) | ||
| val htmlFile = Files.createTempFile("kebab", ".html") | ||
| .also { println("Distribution comparison at $it") } | ||
| DistributionComparison(GitRepo.findFromCurrentDirectory()).render(chart, htmlFile) | ||
|
|
@@ -174,7 +196,6 @@ class ShiftedDistributionRegressionTestTest { | |
| yAxisId = "latency-axis" | ||
| ) | ||
|
|
||
| @Ignore("Known bug: https://ecosystem.atlassian.net/browse/JPERF-1188") | ||
dagguh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| @Test | ||
| fun shouldSeeNoShiftAcrossTheSameResult() { | ||
| val result = FakeResults.fastResult | ||
|
|
||

Uh oh!
There was an error while loading. Please reload this page.