Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix sla report inconsistencies #560

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion schema/mysql/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ BEGIN
DECLARE active_downtimes int unsigned;
DECLARE problem_time bigint unsigned;
DECLARE total_time bigint unsigned;
DECLARE rowCounts int unsigned;
DECLARE done int;
DECLARE cur CURSOR FOR
(
Expand Down Expand Up @@ -69,6 +70,8 @@ BEGIN
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time > in_start_time
AND s.event_time < in_end_time
AND s.hard_state IS NOT NULL
AND s.previous_hard_state IS NOT NULL
Comment on lines +73 to +74
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't these columns already have a NOT NULL constraint?

) UNION ALL (
-- end event to keep loop simple, values are not used
SELECT
Expand Down Expand Up @@ -130,6 +133,7 @@ BEGIN

SET done = 0;
OPEN cur;
SELECT FOUND_ROWS() INTO rowCounts;
read_loop: LOOP
FETCH cur INTO row_event_time, row_event_type, row_event_prio, row_hard_state, row_previous_hard_state;
IF done THEN
Expand All @@ -156,7 +160,12 @@ BEGIN
END LOOP;
CLOSE cur;

SET result = 100 * (total_time - problem_time) / total_time;
-- row count "1" because of the faked ending result used for the
-- cursor loop, whose result set is never used.
IF rowCounts > 1 THEN
Comment on lines +163 to +165
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would also count downtime rows returned by that query which I doubt you want. But also if this would just count the state rows, this makes the assumption that if there's no event in the interval, there's no valid result to report. Problem with this is that if nothing happens on a checkable, there is no history. So for something that's 100% up, had no downtimes, this would now no longer return a value.

SET result = 100 * (total_time - problem_time) / total_time;
END IF; -- else no data available to be reported

RETURN result;
END//
DELIMITER ;
Expand Down
13 changes: 12 additions & 1 deletion schema/pgsql/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ DECLARE
active_downtimes uint := 0;
problem_time biguint := 0;
total_time biguint;
rowCounts uint := 0;
row record;
result decimal(7, 4);
BEGIN
IF in_end_time <= in_start_time THEN
RAISE 'end time must be greater than start time';
Expand Down Expand Up @@ -127,6 +129,8 @@ BEGIN
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time > in_start_time
AND s.event_time < in_end_time
AND s.hard_state IS NOT NULL
AND s.previous_hard_state IS NOT NULL
) UNION ALL (
-- end event to keep loop simple, values are not used
SELECT
Expand All @@ -147,6 +151,7 @@ BEGIN
problem_time := problem_time + row.event_time - last_event_time;
END IF;

rowCounts := rowCounts + 1;
last_event_time := row.event_time;
IF row.event_type = 'state_change' THEN
last_hard_state := row.hard_state;
Expand All @@ -157,7 +162,13 @@ BEGIN
END IF;
END LOOP;

RETURN 100 * (total_time - problem_time) / total_time;
-- row count "1" because of the faked ending result used for the
-- cursor loop, whose result set is never used.
IF rowCounts > 1 THEN
result := 100 * (total_time - problem_time) / total_time;
END IF; -- else no data available to be reported

RETURN result;
END;
$$;

Expand Down
39 changes: 20 additions & 19 deletions tests/sql/sla_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package sql_test

import (
"crypto/rand"
"database/sql"
"database/sql/driver"
"fmt"
"github.com/go-sql-driver/mysql"
Expand All @@ -22,7 +23,7 @@ func TestSla(t *testing.T) {
Events []SlaHistoryEvent
Start uint64
End uint64
Expected float64
Expected sql.NullFloat64
}

tests := []TestData{{
Expand All @@ -31,7 +32,7 @@ func TestSla(t *testing.T) {
Events: nil,
Start: 1000,
End: 2000,
Expected: 100.0,
Expected: sql.NullFloat64{},
}, {
Name: "MultipleStateChanges",
// Some flapping, test that all changes are considered.
Expand All @@ -46,7 +47,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 60.0,
Expected: sql.NullFloat64{Float64: 60.0},
}, {
Name: "OverlappingDowntimesAndProblems",
// SLA should be 90%:
Expand All @@ -65,7 +66,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 90.0,
Expected: sql.NullFloat64{Float64: 90.0},
}, {
Name: "CriticalBeforeInterval",
// If there is no event within the SLA interval, the last state from before the interval should be used.
Expand All @@ -74,7 +75,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 0.0,
Expected: sql.NullFloat64{Float64: 0.0},
}, {
Name: "CriticalBeforeIntervalWithDowntime",
// State change and downtime start from before the SLA interval should be considered if still relevant.
Expand All @@ -84,7 +85,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 80.0,
Expected: sql.NullFloat64{Float64: 80.0},
}, {
Name: "CriticalBeforeIntervalWithOverlappingDowntimes",
// Test that overlapping downtimes are properly accounted for.
Expand All @@ -99,7 +100,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 80.0,
Expected: sql.NullFloat64{Float64: 80.0},
}, {
Name: "FallbackToPreviousState",
// If there is no state event from before the SLA interval, the previous hard state from the first event
Expand All @@ -109,7 +110,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 80.0,
Expected: sql.NullFloat64{Float64: 80.0},
}, {
Name: "FallbackToCurrentState",
// If there are no state history events, the current state of the checkable should be used.
Expand All @@ -118,7 +119,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 0.0,
Expected: sql.NullFloat64{Float64: 0.0},
}, {
Name: "PreferInitialStateFromBeforeOverLaterState",
// The previous_hard_state should only be used as a fallback when there is no event from before the
Expand All @@ -129,7 +130,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 80.0,
Expected: sql.NullFloat64{Float64: 80.0},
}, {
Name: "PreferInitialStateFromBeforeOverCurrentState",
// The current state should only be used as a fallback when there is no state history event.
Expand All @@ -140,7 +141,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 0.0,
Expected: sql.NullFloat64{Float64: 0.0},
}, {
Name: "PreferLaterStateOverCurrentState",
// The current state should only be used as a fallback when there is no state history event.
Expand All @@ -151,7 +152,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 80.0,
Expected: sql.NullFloat64{Float64: 80.0},
}, {
Name: "InitialUnknownReducesTotalTime",
Events: []SlaHistoryEvent{
Expand All @@ -161,7 +162,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 60,
Expected: sql.NullFloat64{Float64: 60},
}, {
Name: "IntermediateUnknownReducesTotalTime",
Events: []SlaHistoryEvent{
Expand All @@ -173,7 +174,7 @@ func TestSla(t *testing.T) {
},
Start: 1000,
End: 2000,
Expected: 60,
Expected: sql.NullFloat64{Float64: 60},
}}

for _, test := range tests {
Expand Down Expand Up @@ -226,14 +227,14 @@ func TestSla(t *testing.T) {
})
}

func execSqlSlaFunc(db *sqlx.DB, m *SlaHistoryMeta, start uint64, end uint64) (float64, error) {
var result float64
func execSqlSlaFunc(db *sqlx.DB, m *SlaHistoryMeta, start uint64, end uint64) (sql.NullFloat64, error) {
var result sql.NullFloat64
err := db.Get(&result, db.Rebind("SELECT get_sla_ok_percent(?, ?, ?, ?)"),
m.HostId, m.ServiceId, start, end)
return result, err
}

func testSla(t *testing.T, db *sqlx.DB, events []SlaHistoryEvent, start uint64, end uint64, expected float64, msg string) {
func testSla(t *testing.T, db *sqlx.DB, events []SlaHistoryEvent, start uint64, end uint64, expected sql.NullFloat64, msg string) {
t.Run("Host", func(t *testing.T) {
testSlaWithObjectType(t, db, events, false, start, end, expected, msg)
})
Expand All @@ -243,7 +244,7 @@ func testSla(t *testing.T, db *sqlx.DB, events []SlaHistoryEvent, start uint64,
}

func testSlaWithObjectType(t *testing.T, db *sqlx.DB,
events []SlaHistoryEvent, service bool, start uint64, end uint64, expected float64, msg string,
events []SlaHistoryEvent, service bool, start uint64, end uint64, expected sql.NullFloat64, msg string,
) {
makeId := func() []byte {
id := make([]byte, 20)
Expand Down Expand Up @@ -271,7 +272,7 @@ func testSlaWithObjectType(t *testing.T, db *sqlx.DB,

r, err := execSqlSlaFunc(db, &meta, start, end)
require.NoError(t, err, "SLA query should not fail")
assert.Equal(t, expected, r, msg)
assert.Equal(t, expected.Float64, r.Float64, msg)
}

type SlaHistoryMeta struct {
Expand Down