X Y
--------
1 11
2 4
3 15
4 18
5 7
6 13
7 26
8 28
9 18
10 30
SELECT (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / (VarP(X) * VarP(Y)) AS RSquared,
(Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X) AS RegressCoeff,
Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X) AS Intercept,
((Count(X) / (Count(X) - 2)) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_Resid,
((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_XCoeff,
((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5 AS SE_Intercept,
((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) / (((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5) AS T_XCoeff,
(Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X)) / (((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5) AS T_Intercept
FROM Set1;
By Code:
SELECT Code,
(Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / (VarP(X) * VarP(Y)) AS RSquared,
(Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X) AS RegressCoeff,
Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X) AS Intercept,
((Count(X) / (Count(X) - 2)) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_Resid,
((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_XCoeff,
((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5 AS SE_Intercept,
((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) / (((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5) AS T_XCoeff,
(Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X)) / (((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5) AS T_Intercept
FROM Set1
GROUP BY Code;
By Section:
SELECT Section,
(Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / (VarP(X) * VarP(Y)) AS RSquared,
(Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X) AS RegressCoeff,
Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X) AS Intercept,
((Count(X) / (Count(X) - 2)) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_Resid,
((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_XCoeff,
((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5 AS SE_Intercept,
((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) / (((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5) AS T_XCoeff,
(Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X)) / (((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5) AS T_Intercept
FROM Set1
GROUP BY Section;
By Section and Code:
SELECT Section,
Code,
(Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / (VarP(X) * VarP(Y)) AS RSquared,
(Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X) AS RegressCoeff,
Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X) AS Intercept,
((Count(X) / (Count(X) - 2)) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_Resid,
((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5 AS SE_XCoeff,
((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5 AS SE_Intercept,
((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) / (((1 / (VarP(X) * (Count(X) - 2))) * (VarP(Y) - (Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) ^ 0.5) AS T_XCoeff,
(Avg(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) / VarP(X)) * Avg(X)) / (((1 / (Count(X) - 2)) * (VarP(Y) - ((Avg(X * Y) - Avg(X) * Avg(Y)) ^ 2 / VarP(X))) * ((VarP(X) + Avg(X) ^ 2) / VarP(X))) ^ 0.5) AS T_Intercept
FROM Set1
GROUP BY Section, Code;
SELECT DSimpleRegress("X","Y","Set1","",1) AS RSquared,
DSimpleRegress("X","Y","Set1","",2) AS XCoeff,
DSimpleRegress("X","Y","Set1","",3) AS Intercept,
DSimpleRegress("X","Y","Set1","",4) AS SE_Resid,
DSimpleRegress("X","Y","Set1","",5) AS SE_XCoeff,
DSimpleRegress("X","Y","Set1","",6) AS SE_Intercept,
DSimpleRegress("X","Y","Set1","",7) AS T_XCoeff,
DSimpleRegress("X","Y","Set1","",8) AS T_Intercept;
By Code:
SELECT Set1.Code,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",1) AS RSquared,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",2) AS XCoeff,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",3) AS Intercept,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",4) AS SE_Resid,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",5) AS SE_XCoeff,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",6) AS SE_Intercept,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",7) AS T_XCoeff,
DSimpleRegress("X","Y","Set1","[Code] = '" & Code & "'",8) AS T_Intercept
FROM Set1
GROUP BY Set1.Code;
By Section:
SELECT Section,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",1) AS RSquared,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",2) AS XCoeff,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",3) AS Intercept,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",4) AS SE_Resid,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",5) AS SE_XCoeff,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",6) AS SE_Intercept,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",7) AS T_XCoeff,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "'",8) AS T_Intercept
FROM Set1
GROUP BY Section;
By Section and Code:
SELECT Set1.Section, Set1.Code,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",1) AS RSquared,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",2) AS XCoeff,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",3) AS Intercept,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",4) AS SE_Resid,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",5) AS SE_XCoeff,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",6) AS SE_Intercept,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",7) AS T_XCoeff,
DSimpleRegress("X","Y","Set1","[Section] = '" & Section & "' And [Code] = '" & Code & "'",8) AS T_Intercept
FROM Set1
GROUP BY Set1.Section, Set1.Code;
' Enumeration used for determining which regression stat(s) the DSimpleRegress function returns
Enum SimpleRegressStat
SimpleRegressStat_All = 0
SimpleRegressStat_RSquared = 1
SimpleRegressStat_XCoeff = 2
SimpleRegressStat_Intercept = 3
SimpleRegressStat_SE_Resid = 4
SimpleRegressStat_SE_XCoeff = 5
SimpleRegressStat_SE_Intercept = 6
SimpleRegressStat_T_XCoeff = 7
SimpleRegressStat_T_Intercept = 8
End Enum
Function DSimpleRegress(X_Column As String, Y_Column As String, Tbl As String, _
Optional Criteria As String = "", Optional Stat As Variant = "")
' Function by Patrick G. Matthews
' Feel free to use and distribute this code, so long as you credit authorship and indicate the
' URL where you found it
' This function calculates statistics for simple linear regression between a single independent
' variable (X_Column) and a dependent variable (Y_Column), and is intended for use in Microsoft
' Access. This function requires a reference to the Microsoft DAO library.
' Depending on the value of the Stat argument, this function returns a single regression
' statistic, such as the R squared or the X coefficient, or it returns an array of all
' available regression stats (for a list of available stats, please see the enumeration
' SimpleRegressStat)
' If either or both corresponding values in the paired X, Y data set are null, those records
' are ignored
' This function is labeled according to the domain aggregate function naming convention as it
' behaves similarly to the other domain aggregates
' X_Column is the independent variable
' Y_Column is the dependent variable
' Tbl is the source table or query for the data
' Criteria defines any filtering criteria you wish to apply to the data set. Be sure to enclose
' text items in single quotes and date values in the # date qualifiers
' Stat determines which regression statistic the function returns. For a full list of the valid
' values (NOT case sensitive) for the Stat argument, see the Select Case structure under
' the label DetermineMode
' For each of the arguments, I strongly recommend that you encase column and table names in
' square brackets. This is mandatory of the column/table name does not follow the usual rules
' for naming database objects
Static Last_X_Column As String
Static Last_Y_Column As String
Static Last_Tbl As String
Static Last_Criteria As String
Static Last_Runtime As Date
Static Result_RSquared As Variant
Static Result_XCoeff As Variant
Static Result_Intercept As Variant
Static Result_SE_Resid As Variant
Static Result_SE_XCoeff As Variant
Static Result_SE_Intercept As Variant
Static Result_T_XCoeff As Variant
Static Result_T_Intercept As Variant
Dim N As Long
Dim AvgX As Variant
Dim AvgY As Variant
Dim AvgXY As Variant
Dim VarPX As Variant
Dim VarPY As Variant
Dim Covar As Variant
Dim SQL As String
Dim rs As DAO.Recordset
Dim Mode As SimpleRegressStat
Dim Results(1 To 8) As Variant
Const ForceRefreshSeconds As Long = 30
On Error GoTo ErrHandler
DetermineMode:
' Determines whether a single regression stat is returned (and if so, which), or whether
' an array of all available stats is returned
Select Case LCase(Stat)
Case "1", "r squared", "rsquared", "r sq", "rsq", "r square", "rsquare", "r-squared", "r-squared", _
"r-sq", "r-sq", "r-square", "r-square"
Mode = SimpleRegressStat_RSquared
Case "2", "x", "x coefficient", "x coeff", "xcoeff", "coeff", "coefficient"
Mode = SimpleRegressStat_XCoeff
Case "3", "intercept", "constant"
Mode = SimpleRegressStat_Intercept
Case "4", "se model", "se regression", "se resid", "se residual", "se residuals", "std error model", _
"std error regression", "std error resid", "std error residual", "std error residuals", _
"standard error model", "standard error regression", "standard error resid", _
"standard error residual", "standard error residuals"
Mode = SimpleRegressStat_SE_Resid
Case "5", "se x", "se x coefficient", "se x coeff", "se xcoeff", "se coeff", "se coefficient", _
"std error x", "std error x coefficient", "std error x coeff", "std error xcoeff", _
"std error coeff", "std error coefficient", "standard error x", "standard error x coefficient", _
"standard error x coeff", "standard error xcoeff", "standard error coeff", _
"standard error coefficient"
Mode = SimpleRegressStat_SE_XCoeff
Case "6", "se intercept", "se constant", "std error intercept", "std error constant", _
"standard error intercept", "standard error constant"
Mode = SimpleRegressStat_SE_Intercept
Case "7", "t x", "t x coefficient", "t x coeff", "t xcoeff", "t coeff", "t coefficient"
Mode = SimpleRegressStat_T_XCoeff
Case "8", "t intercept", "t constant"
Mode = SimpleRegressStat_T_Intercept
Case Else
Mode = SimpleRegressStat_All
End Select
CalculateStats:
' Calculate the regression stats
' This function holds the regression stats in static variables, which retain their state
' between calls. If the values for the X_Column, Y_Column, Tbl, and Criteria arguments
' are the same as those for the last call, and if the seconds elapsed since the last
' call are less than what is specified in the ForceRefreshSeconds constant, then we can
' skip the calculations and go right to assigning the return value
If DateDiff("s", Last_Runtime, Now) >= ForceRefreshSeconds Or Last_X_Column <> X_Column Or _
Last_Y_Column <> Y_Column Or Last_Tbl <> Tbl Or Last_Criteria <> Criteria Then
' Initialize stats to null
Result_RSquared = Null
Result_XCoeff = Null
Result_Intercept = Null
Result_SE_Resid = Null
Result_SE_XCoeff = Null
Result_SE_Intercept = Null
Result_T_XCoeff = Null
Result_T_Intercept = Null
' All the regression stats can be calculated from the following six values: N, Avg(X), Avg(Y),
' Avg(X * Y), VarP(X), and VarP(Y). Use the following SQL statement to get these six values
SQL = "SELECT Count(1) AS N, Avg(" & X_Column & ") AS AvgX, Avg(" & Y_Column & ") AS AvgY, " & _
"Avg(" & X_Column & " * " & Y_Column & ") AS AvgXY, VarP(" & X_Column & ") AS VarPX, " & _
"VarP(" & Y_Column & ") AS VarPY " & _
"FROM " & Tbl & " " & _
"WHERE " & IIf(Trim(Criteria) <> "", Criteria & " And ", "") & X_Column & " Is Not Null " & _
"And " & Y_Column & " Is Not Null"
Set rs = CurrentDb.OpenRecordset(SQL, dbOpenSnapshot)
' Transfer values from recordset to variables, then close recordset
AvgX = CDbl(rs!AvgX)
AvgY = CDbl(rs!AvgY)
AvgXY = CDbl(rs!AvgXY)
N = rs!N
VarPX = CDbl(rs!VarPX)
VarPY = CDbl(rs!VarPY)
Covar = AvgXY - AvgX * AvgY
rs.Close
' There must be at least 3 valid data points for regression to work. If there are 2 or
' fewer data points, we skip the rest of the calculations, thus allowing the regression
' stats to remain as null
If N > 2 Then
' Calculate various stats
Result_RSquared = Covar ^ 2 / (VarPX * VarPY)
Result_XCoeff = Covar / VarPX
Result_Intercept = AvgY - AvgX * Result_XCoeff
Result_SE_Resid = ((N / (N - 2)) * (VarPY - Covar ^ 2 / VarPX)) ^ 0.5
Result_SE_XCoeff = Result_SE_Resid * (1 / (N * VarPX)) ^ 0.5
Result_SE_Intercept = Result_SE_Resid * ((VarPX + AvgX ^ 2) / (N * VarPX)) ^ 0.5
Result_T_XCoeff = Result_XCoeff / Result_SE_XCoeff
Result_T_Intercept = Result_Intercept / Result_SE_Intercept
End If
End If
ReturnValue:
' Set the fnction's return value
Select Case Mode
Case SimpleRegressStat_All
Results(1) = Result_RSquared
Results(2) = Result_XCoeff
Results(3) = Result_Intercept
Results(4) = Result_SE_Resid
Results(5) = Result_SE_XCoeff
Results(6) = Result_SE_Intercept
Results(7) = Result_T_XCoeff
Results(8) = Result_T_Intercept
DSimpleRegress = Results
Case SimpleRegressStat_RSquared
DSimpleRegress = Result_RSquared
Case SimpleRegressStat_XCoeff
DSimpleRegress = Result_XCoeff
Case SimpleRegressStat_Intercept
DSimpleRegress = Result_Intercept
Case SimpleRegressStat_SE_Resid
DSimpleRegress = Result_SE_Resid
Case SimpleRegressStat_SE_XCoeff
DSimpleRegress = Result_SE_XCoeff
Case SimpleRegressStat_SE_Intercept
DSimpleRegress = Result_SE_Intercept
Case SimpleRegressStat_T_XCoeff
DSimpleRegress = Result_T_XCoeff
Case SimpleRegressStat_T_Intercept
DSimpleRegress = Result_T_Intercept
End Select
Last_Runtime = Now
GoTo Cleanup
ErrHandler:
DSimpleRegress = CVErr(Err.Number)
Cleanup:
Set rs = Nothing
End Function
Have a question about something in this article? You can receive help directly from the article author. Sign up for a free trial to get started.
Comments (4)
Author
Commented:The SQL examples, Access sample file, and VBA code have all been updated, using closer-to-optimal calculation techniques graciously suggested by harfang.
BTW, one additional calculation that did not make it into the article is how to calculate the slope when the regression line is forced through the origin. The formula for that is:
slope = Sum(X * Y) / Sum(X ^ 2)
In SQL:
Open in new window
Cheers,
Patrick
Commented:
Great article! It covers one topic, completely, and well. The formulas are presented with just the right amount of explanations (and they were correct before the last update, as you well know). I have the feeling I will be glad to copy-and-paste from it for myself sometime soon, or point readers to it.
Markus -- (°v°)
Commented:
Example:
Open in new window
If you like that result, I would suggest playing with doing a similar VarP() and Count(x) expression substitutions like I did with Term1.Open in new window
If you do, please let me know about any performance gains.Commented:
This may not be the place to ask this question, but here goes.
I'm trying to use the vba function in standard vba (rather than part of an sql statement).
It seems to be reading the function and executing, but i keep getting the 3115 error code.
Any recommendations on what i need to define my variable name as (variant?, double?, etc.) to get it to work out?
Any advice is appreciated.
Thanks!