lucian-ioan / public-notes

Apache License 2.0
0 stars 0 forks source link

Figure out which data quality checks are best suited for elastic/integrations #4

Open lucian-ioan opened 6 months ago

lucian-ioan commented 6 months ago

Sample code ideas for identifying missing data and filling it with the average of the other data points, and for detecting spikes in the data:

package main

import (
    "fmt"
    "log"
    "math"
    "github.com/gonum/stat"
)

// checkMissingValues reports whether data contains at least one value that is
// NaN or infinite (of either sign). A nil or empty slice yields false.
func checkMissingValues(data []float64) bool {
    for i := range data {
        v := data[i]
        if math.IsNaN(v) {
            return true
        }
        // A sign argument of 0 matches both +Inf and -Inf.
        if math.IsInf(v, 0) {
            return true
        }
    }
    return false
}

// detectSpikes returns the indices of the elements of data whose z-score
// magnitude is strictly greater than threshold.
//
// The z-score of an element is (value - mean) / stdDev, where mean and stdDev
// are the values returned by stat.MeanStdDev for the unweighted data. The
// result is always a non-nil slice; it is empty when nothing exceeds the
// threshold. Because comparisons against NaN are false, an element whose
// z-score is NaN is never reported.
func detectSpikes(data []float64, threshold float64) []int {
    mean, stdDev := stat.MeanStdDev(data, nil)

    spikes := []int{}
    for i := range data {
        zScore := (data[i] - mean) / stdDev
        if magnitude := math.Abs(zScore); magnitude > threshold {
            spikes = append(spikes, i)
        }
    }
    return spikes
}

// main runs the data quality demo on a hard-coded latency series: it warns if
// the series contains NaN or infinite values, drops those values, reports the
// indices of z-score spikes in the cleaned series, and prints the cleaned
// series together with its mean.
func main() {
    // Hard-coded sample series used by the demo.
    latencyData := []float64{10.5, 15.2, 12.8, 30.0, 14.6, 18.3, 40.0, 13.7}

    // Warn, but keep going, when the series contains NaN or infinite values.
    if hasMissing := checkMissingValues(latencyData); hasMissing {
        log.Println("Warning: Missing values found in the data.")
    }

    // Data cleaning: keep only the finite values, preserving their order.
    cleanedData := []float64{}
    for i := range latencyData {
        sample := latencyData[i]
        if math.IsNaN(sample) || math.IsInf(sample, 0) {
            continue
        }
        cleanedData = append(cleanedData, sample)
    }

    // The reported indices are positions in cleanedData, not in latencyData.
    // 2.0 is the z-score threshold; adjust as needed.
    spikeIndices := detectSpikes(cleanedData, 2.0)
    fmt.Printf("Detected spikes at indices: %v\n", spikeIndices)

    fmt.Printf("Cleaned data: %v\n", cleanedData)
    fmt.Printf("Average Latency after cleaning: %.2f\n", stat.Mean(cleanedData, nil))
}