Создание линии тренда из набора данных SQL
приведенный ниже код возвращает количество разрешенных билетов и количество открытых билетов за период (период гггг,WW), возвращающийся на определенное количество дней. Например, если @NoOfDays равно 7:
разрешено | открыто | неделя | год | период
56 | 30 | 13 | 2012 | 2012, 13
237 | 222 | 14 | 2012 | 2012, 14
"разрешено" и "открыто" отображаются линиями (ось y) по периодам (ось x). Я хотел бы добавить еще один столбец "trend", который вернет число, образующее на графике по периодам линию тренда (простая линейная регрессия). Я действительно хочу использовать оба набора значений в качестве источников данных для тренда.
это код, который у меня есть:
-- Weekly ticket counts for the last @NoOfDays days (T-SQL).
-- Returns one row per (year, week): number of distinct tickets closed and
-- number of distinct tickets entered in that week, plus a "yyyy, ww" label.
-- @NoOfDays is supplied by the caller / reporting tool.
--
-- FULL OUTER JOIN (instead of LEFT OUTER JOIN) keeps weeks in which tickets
-- were opened but none were closed; COALESCE turns the missing side into 0
-- so both series have a value for every period.
SELECT
    w.resolved,
    w.opened,
    w.[week],
    w.[year],
    CAST(w.[year] AS varchar(5)) + ', ' + CAST(w.[week] AS varchar(5)) AS period
FROM (
    SELECT
        COALESCE(a.resolved, 0)               AS resolved,
        COALESCE(b.opened, 0)                 AS opened,
        COALESCE(a.weekClosed, b.weekEntered) AS [week],
        COALESCE(a.yearClosed, b.yearEntered) AS [year]
    FROM (
        -- tickets closed inside the window, per week of date_closed
        SELECT
            COUNT(DISTINCT TicketNbr)    AS resolved,
            DATEPART(WEEK, date_closed)  AS weekClosed,
            YEAR(date_closed)            AS yearClosed
        FROM v_rpt_Service
        -- DATEADD/DATEDIFF pair truncates GETDATE() to midnight before subtracting the days
        WHERE date_closed >= DATEADD(DAY, DATEDIFF(DAY, 0, GETDATE()) - @NoOfDays, 0)
        GROUP BY DATEPART(WEEK, date_closed), YEAR(date_closed)
    ) AS a
    FULL OUTER JOIN (
        -- tickets entered inside the window, per week of date_entered
        SELECT
            COUNT(DISTINCT TicketNbr)    AS opened,
            DATEPART(WEEK, date_entered) AS weekEntered,
            YEAR(date_entered)           AS yearEntered
        FROM v_rpt_Service
        WHERE date_entered >= DATEADD(DAY, DATEDIFF(DAY, 0, GETDATE()) - @NoOfDays, 0)
        GROUP BY DATEPART(WEEK, date_entered), YEAR(date_entered)
    ) AS b
        ON  a.weekClosed = b.weekEntered
        AND a.yearClosed = b.yearEntered
) AS w
ORDER BY w.[year], w.[week]
Edit:
Согласно serc.carleton.edu/files/mathyouneed/best_fit_line_dividing.pdf, мне, похоже, нужно разбить данные пополам и вычислить среднее значение каждой половины. Затем нужно найти наиболее подходящую прямую и по её наклону и точке пересечения с осью y вычислить значения для столбца "тренд" по формуле y = mx + b
?
Я знаю, что это очень возможно в SQL, однако программа, в которую я вставляю SQL, имеет ограничения на то, что я могу сделать.
красные и синие точки-это числа, которые я возвращаю сейчас (открыт и разрешен). Мне нужно вернуть значение для каждого периода в "тренде", чтобы создать фиолетовую линию. (это изображение гипотетически)
3 ответа
меня заинтересовала проблема, и я обнаружил, что лучший способ grok сложного запроса-переформатировать его, используя мой собственный стиль и соглашения. Я применил их к вашему решению, и результат ниже. Я понятия не имею, будет ли это иметь какую-либо ценность для вас...
- было несколько битов кода, которые я не считаю частью синтаксиса MS T-SQL, например
({fn xxx }
и . - этот код компилируется, но я не могу его запустить, поскольку у меня нет данных таблица правильно настроена.
- Я сделал множество изменений в кодировке, которые потребуют серьезных объяснений, и я собираюсь пропустить большую часть этого. Добавьте комментарий, если хотите что-то объяснить.
- Я добавил много пробелов. Разница между разборчивым и неразборчивым кодом часто заключается только в восприятии и вкусе читающего, и мои соглашения об оформлении могут вам совсем не понравиться.
- Не уверен, каким должен быть конечный результирующий набор (т. е. какие столбцы должны возвращаться)
некоторые дополнительные примечания:
- этот запрос не получит элементы, введенные в неделю, если никакие элементы не были также закрыты на этой неделе
- недели могут быть неполными, т. е. в них могут присутствовать не все семь дней (настройте @Interval так, чтобы он всегда включал полные недели — но как быть с нечетными интервалами?)
- умножьте значения count(*) на 1.0, чтобы заранее преобразовать их в дробные числа (это избавляет от приведений типов с усечением и от целочисленной арифметики)
- сделал это cte чтобы более ранние формулы были заменены символами в более поздних формулах (в этот момент все стало намного более разборчивым)
так вот что я придумал:
-- Weekly resolved/opened ticket counts plus a two-point trend line (y = m*x + b).
--
-- The caller must supply two datetime values:
--   @FullInterval : start of the whole reporting window
--   @HalfInterval : start of the second half of that window
-- e.g. DATEADD(DAY, DATEDIFF(DAY, 0, GETDATE()) - @NoOfDays, 0) and the same
-- expression with (@NoOfDays / 2).
--
-- The totals are window aggregates over the WHOLE result set (OVER ()).
-- Plain SUM()/COUNT() under a per-week GROUP BY would see a single row each,
-- which makes x1 = x2 = 0 and the slope a division by zero.
;WITH weekly as (
    -- One row per week that has closed tickets. *_half1 columns count over the
    -- whole window; *_half2 columns count only tickets dated on/after
    -- @HalfInterval and are NULL for weeks without such tickets.
    select
         c.period
        ,c.yearClosed
        ,c.weekClosed
        ,c.resolved_half1
        ,f.resolved_half2
        ,c.opened_half1
        ,f.opened_half2
    from (select
             a.yearClosed
            ,a.weekClosed
            ,a.resolved_half1
            ,b.yearEntered
            ,b.weekEntered
            ,b.opened_half1
            ,cast(a.yearClosed as varchar(5)) + ', ' + cast(a.weekClosed as varchar(5)) period
          from (-- Number of items per week that closed on/after @FullInterval
                select
                     count(distinct TicketNbr) * 1.0 resolved_half1
                    ,datepart(wk, date_closed) weekClosed
                    ,year(date_closed) yearClosed
                from v_rpt_Service
                where date_closed >= @FullInterval
                group by
                     datepart(wk, date_closed)
                    ,year(date_closed) ) a
          left outer join (-- Number of items per week that were entered on/after @FullInterval
                select
                     count(distinct TicketNbr) * 1.0 opened_half1
                    ,datepart(wk, date_entered) weekEntered
                    ,year(date_entered) yearEntered
                from v_rpt_Service
                where date_entered >= @FullInterval
                group by
                     datepart(wk, date_entered)
                    ,year(date_entered) ) b
            on  a.weekClosed = b.weekEntered
            and a.yearClosed = b.yearEntered) c
    left outer join (select
             d.yearClosed
            ,d.weekClosed
            ,d.resolved_half2
            ,e.yearEntered
            ,e.weekEntered
            ,e.opened_half2
            ,cast(d.yearClosed as varchar(5)) + ', ' + cast(d.weekClosed as varchar(5)) period
          from (-- Number of items per week that closed on/after @HalfInterval
                select
                     count(distinct TicketNbr) * 1.0 resolved_half2
                    ,datepart(wk, date_closed) weekClosed
                    ,year(date_closed) yearClosed
                from v_rpt_Service
                where date_closed >= @HalfInterval
                group by
                     datepart(wk, date_closed)
                    ,year(date_closed) ) d
          left outer join (-- Number of items per week that were entered on/after @HalfInterval
                select
                     count(distinct TicketNbr) * 1.0 opened_half2
                    ,datepart(wk, date_entered) weekEntered
                    ,year(date_entered) yearEntered
                from v_rpt_Service
                where date_entered >= @HalfInterval
                group by
                     datepart(wk, date_entered)
                    ,year(date_entered) ) e
            on  d.weekClosed = e.weekEntered
            and d.yearClosed = e.yearEntered ) f
        on c.period = f.period
)
,totals as (
    -- Whole-set totals repeated on every row. SUM() skips NULLs and returns NULL
    -- when nothing matched, so each sum is coalesced to 0 before adding.
    select
         period
        ,yearClosed
        ,weekClosed
        ,sum_full  = cast(coalesce(sum(resolved_half1) over (), 0) + coalesce(sum(opened_half1) over (), 0) as float)
        ,sum_half2 = cast(coalesce(sum(resolved_half2) over (), 0) + coalesce(sum(opened_half2) over (), 0) as float)
        ,n_full    = count(resolved_half1) over () + count(opened_half1) over ()
        ,n_half2   = count(resolved_half2) over () + count(opened_half2) over ()
        ,n_weeks   = count(*) over ()
    from weekly
)
,cte as (
    select
         period
        ,[row] = row_number() over(order by yearClosed, weekClosed)
         -- first-half average: (full total - second-half total) / half of the full value count
        ,y1 = (sum_full - sum_half2) / nullif(n_full / 2.0, 0)
         -- second-half average
        ,y2 = sum_half2 / nullif(n_half2, 0)
         -- 4.0 forces non-integer division, so x1 <> x2 even with fewer than four weeks
        ,x1 = n_weeks / 4.0
        ,x2 = n_weeks / 4.0 * 3
    from totals
)
,line as (
    select
         [row]
        ,period
        ,x1
        ,y1
        ,x2
        ,y2
         -- NULLIF yields a NULL slope instead of a divide-by-zero error
        ,m = (y1 - y2) / nullif(x1 - x2, 0)
    from cte
)
SELECT
     [row]
    ,Period = period
    ,x1
    ,y1
    ,x2
    ,y2
    ,m
    ,b = y2 - (m * x2)
    ,trend = (m * [row]) + (y2 - (m * x2))
from line
order by [row]
В качестве дополнения: весь подзапрос "c" можно заменить чем-то вроде следующего, а "f" — слегка измененной версией. Будет ли производительность лучше или хуже, зависит от размера таблицы, индексов и других трудно предсказуемых факторов.
-- Alternative source for the weekly counts of subquery "c".
-- Every ticket contributes up to two events: a "resolved" event dated
-- date_closed and an "opened" event dated date_entered. Each event is bucketed
-- by the week of ITS OWN date, so opened tickets are no longer counted under
-- the week in which they were closed, and tickets that are still open
-- (NULL date_closed) no longer form a NULL week.
-- @FullInterval (start of the reporting window) must be supplied by the caller.
select
     datepart(wk, ev.event_date) weekClosed
    ,year(ev.event_date) yearClosed
    ,count (distinct case
                when ev.is_resolved = 1 then ev.TicketNbr
                else null
            end) * 1.0 resolved_half1
     -- NULL (not 0) when no ticket was opened that week, matching what the
     -- LEFT OUTER JOIN in subquery "c" produces
    ,nullif(count (distinct case
                when ev.is_resolved = 0 then ev.TicketNbr
                else null
            end), 0) * 1.0 opened_half1
from (select
           TicketNbr
          ,date_closed event_date
          ,1 is_resolved
      from v_rpt_Service
      where date_closed >= @FullInterval
      union all
      select
           TicketNbr
          ,date_entered
          ,0
      from v_rpt_Service
      where date_entered >= @FullInterval) ev
group by
     datepart(wk, ev.event_date)
    ,year(ev.event_date)
-- like subquery "c", keep only weeks in which at least one ticket was closed
having count (distinct case
                  when ev.is_resolved = 1 then ev.TicketNbr
                  else null
              end) > 0
Я понял. Я разделил данные на несколько производных таблиц и подзапросов, по существу разделив данные пополам. Это мои формулы для получения каждого значения:
*(each row is a week)*
y1 = average of data first half
y2 = average of data second half
x1 = 1/4 of number of weeks
x2 = 3/4 of number of weeks
m = (y1-y2)/(x1-x2)
b = y2 - (m * x2)
trend = (m * row_number) + b
и вот мой (очень грязный) SQL-код:
-- Weekly resolved/opened ticket counts with a two-point trend line.
--
--   y1 = average of the first-half values     x1 = 1/4 of the number of weeks
--   y2 = average of the second-half values    x2 = 3/4 of the number of weeks
--   m  = (y1 - y2) / (x1 - x2)    b = y2 - m * x2    trend = m * row + b
--
-- Changes compared with the hard-coded version:
--   * the window length is the variable @NoOfDays (default 180) instead of a literal;
--     drop the DECLARE/SET of @NoOfDays if the host tool already supplies it;
--   * all divisions are done in float with NULLIF guards, so short windows
--     give NULL rather than a divide-by-zero error or a truncated average;
--   * the second-half counts are joined on closed year/week only, so a week
--     without opened tickets no longer loses its second-half data;
--   * each formula is computed once in a CTE instead of being pasted repeatedly.
DECLARE @NoOfDays INT
SET @NoOfDays = 180

DECLARE @FullStart DATETIME
DECLARE @HalfStart DATETIME
-- midnight today minus the whole window / minus half of the window
SET @FullStart = DATEADD(DAY, DATEDIFF(DAY, 0, GETDATE()) - @NoOfDays, 0)
SET @HalfStart = DATEADD(DAY, DATEDIFF(DAY, 0, GETDATE()) - (@NoOfDays / 2), 0)

;WITH closed_full AS (
    -- tickets closed per week over the whole window
    SELECT
        COUNT(DISTINCT TicketNbr)   AS resolved_half1,
        DATEPART(WEEK, date_closed) AS weekClosed,
        YEAR(date_closed)           AS yearClosed
    FROM v_rpt_Service
    WHERE date_closed >= @FullStart
    GROUP BY DATEPART(WEEK, date_closed), YEAR(date_closed)
),
opened_full AS (
    -- tickets entered per week over the whole window
    SELECT
        COUNT(DISTINCT TicketNbr)    AS opened_half1,
        DATEPART(WEEK, date_entered) AS weekEntered,
        YEAR(date_entered)           AS yearEntered
    FROM v_rpt_Service
    WHERE date_entered >= @FullStart
    GROUP BY DATEPART(WEEK, date_entered), YEAR(date_entered)
),
closed_half AS (
    -- tickets closed per week over the second half of the window
    SELECT
        COUNT(DISTINCT TicketNbr)   AS resolved_half2,
        DATEPART(WEEK, date_closed) AS weekClosed,
        YEAR(date_closed)           AS yearClosed
    FROM v_rpt_Service
    WHERE date_closed >= @HalfStart
    GROUP BY DATEPART(WEEK, date_closed), YEAR(date_closed)
),
opened_half AS (
    -- tickets entered per week over the second half of the window
    SELECT
        COUNT(DISTINCT TicketNbr)    AS opened_half2,
        DATEPART(WEEK, date_entered) AS weekEntered,
        YEAR(date_entered)           AS yearEntered
    FROM v_rpt_Service
    WHERE date_entered >= @HalfStart
    GROUP BY DATEPART(WEEK, date_entered), YEAR(date_entered)
),
weekly AS (
    -- one row per week with closed tickets; second-half columns stay NULL
    -- for weeks that have no second-half data
    SELECT
        cf.yearClosed,
        cf.weekClosed,
        CAST(cf.yearClosed AS varchar(5)) + ', ' + CAST(cf.weekClosed AS varchar(5)) AS period,
        cf.resolved_half1,
        ch.resolved_half2,
        ofl.opened_half1,
        oh.opened_half2
    FROM closed_full AS cf
    LEFT JOIN opened_full AS ofl
        ON  ofl.weekEntered = cf.weekClosed
        AND ofl.yearEntered = cf.yearClosed
    LEFT JOIN closed_half AS ch
        ON  ch.weekClosed = cf.weekClosed
        AND ch.yearClosed = cf.yearClosed
    -- opened_half hangs off closed_half, as in the original d/e derived tables
    LEFT JOIN opened_half AS oh
        ON  oh.weekEntered = ch.weekClosed
        AND oh.yearEntered = ch.yearClosed
),
totals AS (
    -- whole-set totals repeated on every row; SUM() returns NULL when nothing
    -- matched, so each sum is coalesced to 0 before adding
    SELECT
        period,
        resolved_half1,
        resolved_half2,
        opened_half1,
        opened_half2,
        ROW_NUMBER() OVER (ORDER BY yearClosed, weekClosed) AS [row],
        CAST(COALESCE(SUM(resolved_half1) OVER (), 0) + COALESCE(SUM(opened_half1) OVER (), 0) AS float) AS sum_full,
        CAST(COALESCE(SUM(resolved_half2) OVER (), 0) + COALESCE(SUM(opened_half2) OVER (), 0) AS float) AS sum_half2,
        COUNT(resolved_half1) OVER () + COUNT(opened_half1) OVER () AS n_full,
        COUNT(resolved_half2) OVER () + COUNT(opened_half2) OVER () AS n_half2,
        COUNT(*) OVER () AS n_weeks
    FROM weekly
),
points AS (
    SELECT
        period,
        resolved_half1,
        resolved_half2,
        opened_half1,
        opened_half2,
        [row],
        (sum_full - sum_half2) / NULLIF(n_full / 2.0, 0) AS y1,
        sum_half2 / NULLIF(n_half2, 0)                   AS y2,
        -- 4.0 forces non-integer division so x1 <> x2 even for fewer than four weeks
        n_weeks / 4.0                                    AS x1,
        n_weeks / 4.0 * 3                                AS x2
    FROM totals
),
fitted AS (
    SELECT
        period,
        resolved_half1,
        resolved_half2,
        opened_half1,
        opened_half2,
        [row],
        y1,
        y2,
        x1,
        x2,
        (y1 - y2) / NULLIF(x1 - x2, 0) AS m
    FROM points
)
SELECT
    resolved_half1,
    resolved_half2,
    opened_half1,
    opened_half2,
    period,
    y1,
    y2,
    x1,
    x2,
    m,
    y2 - (m * x2)                 AS b,
    (m * [row]) + (y2 - (m * x2)) AS trend,
    [row]
FROM fitted
ORDER BY [row]
Этот код использует жестко закодированное значение 180 дней. Мне все еще нужно иметь возможность использовать переменную для выбора количества дней (без ошибок деления на 0), и код действительно нужно почистить. Если кто-то сможет сделать для меня эти две вещи (я не силен в SQL), награда достанется ему.
изображение:
Думаю, это должно сработать; если нет — опубликуйте немного реальных примеров данных, и я посмотрю, смогу ли доработать запрос:
-- Weekly opened/closed ticket counts with a least-squares trend line.
--
-- Output: period ("yyyy" + zero-padded week), opened, closed, trend_point,
-- and the line coefficients A (intercept) and B (slope) of  Y = A + B * X.
--
-- X is the period's row number (1, 2, 3, ...) and Y is the mean of opened and
-- closed for that period, so both series feed the fit and the result is a
-- trend over time. Fitting the per-period means gives the same line as
-- fitting every opened and closed value as separate points.
-- All arithmetic is done in float; B, A and trend_point are NULL when the
-- slope is undefined (a single period).
DECLARE @NoOfDays INT
SET @NoOfDays = 180

;WITH tickets AS
(
    -- resolved tickets, bucketed by the week in which they were closed
    SELECT DISTINCT
         DATENAME(YEAR, date_closed) + RIGHT('000' + CAST(DATEPART(WEEK, date_closed) AS VARCHAR(5)), 3) AS Period
        ,TicketNbr
        ,1 AS ticket_type --resolved
    FROM v_rpt_Service
    WHERE date_closed >= DATEADD(DAY, DATEDIFF(DAY, 0, GETDATE()) - @NoOfDays, 0)

    UNION ALL

    -- opened tickets, bucketed by the week in which they were entered
    SELECT DISTINCT
         DATENAME(YEAR, date_entered) + RIGHT('000' + CAST(DATEPART(WEEK, date_entered) AS VARCHAR(5)), 3) AS Period
        ,TicketNbr
        ,0 AS ticket_type --opened
    FROM v_rpt_Service
    WHERE date_entered >= DATEADD(DAY, DATEDIFF(DAY, 0, GETDATE()) - @NoOfDays, 0)
)
,tickets2 AS
(
    -- one row per period with both counts
    SELECT
         Period
        ,SUM(CASE WHEN ticket_type = 0 THEN 1 ELSE 0 END) AS opened
        ,SUM(CASE WHEN ticket_type = 1 THEN 1 ELSE 0 END) AS closed
    FROM tickets
    GROUP BY
        Period
)
,tickets3 AS
(
    -- x = row number of the period, y = mean of the two series
    SELECT
         Period
        ,opened
        ,closed
        ,CAST(ROW_NUMBER() OVER (ORDER BY Period ASC) AS FLOAT) AS x
        ,CAST(opened + closed AS FLOAT) / 2 AS y
    FROM tickets2
)
,sums AS
(
    -- regression sums over the whole set, repeated on every row
    SELECT
         Period
        ,opened
        ,closed
        ,x
        ,CAST(COUNT(*) OVER () AS FLOAT) AS n
        ,SUM(x)     OVER () AS sum_x
        ,SUM(y)     OVER () AS sum_y
        ,SUM(x * x) OVER () AS sum_xx
        ,SUM(x * y) OVER () AS sum_xy
    FROM tickets3
)
,fit AS
(
    SELECT
         Period
        ,opened
        ,closed
        ,x
        ,sum_x / n AS mean_x
        ,sum_y / n AS mean_y
         -- NULLIF: the denominator is 0 when there is only one period
        ,(n * sum_xy - sum_x * sum_y) / NULLIF(n * sum_xx - sum_x * sum_x, 0) AS B
    FROM sums
)
--Formula for linear regression is Y = A + BX
SELECT
     Period AS period
    ,opened
    ,closed
    ,(mean_y - B * mean_x) + B * x AS trend_point
    ,mean_y - B * mean_x AS A
    ,B
FROM fit
ORDER BY Period