portable - Desviación estándar para SQLite
sqlite studio (8)
Busqué en los documentos de SQLite y no pude encontrar nada, pero también busqué en Google y aparecieron algunos resultados.
¿SQLite tiene alguna función integrada de Desviación Estándar?
Agregó un poco de detección de errores en las funciones de Python
class StdevFunc:
"""
For use as an aggregate function in SQLite
"""
def __init__(self):
self.M = 0.0
self.S = 0.0
self.k = 0
def step(self, value):
try:
# automatically convert text to float, like the rest of SQLite
val = float(value) # if fails, skips this iteration, which also ignores nulls
tM = self.M
self.k += 1
self.M += ((val - tM) / self.k)
self.S += ((val - tM) * (val - self.M))
except:
pass
def finalize(self):
if self.k <= 1: # avoid division by zero
return none
else:
return math.sqrt(self.S / (self.k-1))
Implementé el método de Welford (igual que extension-functions.c
) como un UDF de SQLite:
$db->sqliteCreateAggregate(''stdev'',
function (&$context, $row, $data) // step callback
{
if (isset($context) !== true) // $context is null at first
{
$context = array
(
''k'' => 0,
''m'' => 0,
''s'' => 0,
);
}
if (isset($data) === true) // the standard is non-NULL values only
{
$context[''s''] += ($data - $context[''m'']) * ($data - ($context[''m''] += ($data - $context[''m'']) / ++$context[''k'']));
}
return $context;
},
function (&$context, $row) // fini callback
{
if ($context[''k''] > 0) // return NULL if no non-NULL values exist
{
return sqrt($context[''s''] / $context[''k'']);
}
return null;
},
1);
Eso está en PHP ( $db
es el objeto PDO) pero debería ser trivial para portar a otro idioma.
SQLite es tan genial <3
Las funciones agregadas soportadas por SQLite están aquí:
http://www.sqlite.org/lang_aggfunc.html
STDEV no está en la lista.
Sin embargo, el módulo extension-functions.c
en esta página contiene una función STDEV.
No, busqué en este mismo problema y terminé de hacer los cálculos con mi aplicación (PHP)
Puedes calcular la varianza en SQL:
create table t (row int);
insert into t values (1),(2),(3);
SELECT AVG((t.row - sub.a) * (t.row - sub.a)) as var from t,
(SELECT AVG(row) AS a FROM t) AS sub;
0.666666666666667
Sin embargo, todavía tiene que calcular la raíz cuadrada para obtener la desviación estándar.
Todavía no hay una función stdev incorporada en sqlite. Sin embargo, puede definir (como lo ha hecho Alix) una función agregadora definida por el usuario. Aquí hay un ejemplo completo en Python:
import sqlite3
import math
class StdevFunc:
def __init__(self):
self.M = 0.0
self.S = 0.0
self.k = 1
def step(self, value):
if value is None:
return
tM = self.M
self.M += (value - tM) / self.k
self.S += (value - tM) * (value - self.M)
self.k += 1
def finalize(self):
if self.k < 3:
return None
return math.sqrt(self.S / (self.k-2))
with sqlite3.connect('':memory:'') as con:
con.create_aggregate("stdev", 1, StdevFunc)
cur = con.cursor()
cur.execute("create table test(i)")
cur.executemany("insert into test(i) values (?)", [(1,), (2,), (3,), (4,), (5,)])
cur.execute("insert into test(i) values (null)")
cur.execute("select avg(i) from test")
print("avg: %f" % cur.fetchone()[0])
cur.execute("select stdev(i) from test")
print("stdev: %f" % cur.fetchone()[0])
Esto imprimirá:
avg: 3.000000
stdev: 1.581139
Compare con MySQL: http://sqlfiddle.com/#!2/ad42f3/3/0
un pequeño truco
select ((sum(value)*sum(value) - sum(value * value))/((count(*)-1)*(count(*))))
from the_table ;
entonces lo único que queda es calcular sqrt fuera.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#Values produced by this script can be verified by follwing the steps
#found at https://support.microsoft.com/en-us/kb/213930 to Verify
#by chosing a non memory based database.
import sqlite3
import math
import random
import os
import sys
import traceback
import random
class StdevFunc:
def __init__(self):
self.M = 0.0 #Mean
self.V = 0.0 #Used to Calculate Variance
self.S = 0.0 #Standard Deviation
self.k = 1 #Population or Small
def step(self, value):
try:
if value is None:
return None
tM = self.M
self.M += (value - tM) / self.k
self.V += (value - tM) * (value - self.M)
self.k += 1
except Exception as EXStep:
pass
return None
def finalize(self):
try:
if ((self.k - 1) < 3):
return None
#Now with our range Calculated, and Multiplied finish the Variance Calculation
self.V = (self.V / (self.k-2))
#Standard Deviation is the Square Root of Variance
self.S = math.sqrt(self.V)
return self.S
except Exception as EXFinal:
pass
return None
def Histogram(Population):
try:
BinCount = 6
More = 0
#a = 1 #For testing Trapping
#b = 0 #and Trace Back
#c = (a / b) #with Detailed Info
#If you want to store the Database
#uncDatabase = os.path.join(os.getcwd(),"BellCurve.db3")
#con = sqlite3.connect(uncDatabase)
#If you want the database in Memory
con = sqlite3.connect('':memory:'')
#row_factory allows accessing fields by Row and Col Name
con.row_factory = sqlite3.Row
#Add our Non Persistent, Runtime Standard Deviation Function to the Database
con.create_aggregate("Stdev", 1, StdevFunc)
#Lets Grab a Cursor
cur = con.cursor()
#Lets Initialize some tables, so each run with be clear of previous run
cur.executescript(''drop table if exists MyData;'') #executescript requires ; at the end of the string
cur.execute("create table IF NOT EXISTS MyData(''ID'' INTEGER PRIMARY KEY AUTOINCREMENT, ''Val'' FLOAT)")
cur.executescript(''drop table if exists Bins;'') #executescript requires ; at the end of the string
cur.execute("create table IF NOT EXISTS Bins(''ID'' INTEGER PRIMARY KEY AUTOINCREMENT, ''Bin'' UNSIGNED INTEGER, ''Val'' FLOAT, ''Frequency'' UNSIGNED BIG INT)")
#Lets generate some random data, and insert in to the Database
for n in range(0,(Population)):
sql = "insert into MyData(Val) values ({0})".format(random.uniform(-1,1))
#If Whole Number Integer greater that value of 2, Range Greater that 1.5
#sql = "insert into MyData(Val) values ({0})".format(random.randint(-1,1))
cur.execute(sql)
pass
#Now let’s calculate some built in Aggregates, that SQLite comes with
cur.execute("select Avg(Val) from MyData")
Average = cur.fetchone()[0]
cur.execute("select Max(Val) from MyData")
Max = cur.fetchone()[0]
cur.execute("select Min(Val) from MyData")
Min = cur.fetchone()[0]
cur.execute("select Count(Val) from MyData")
Records = cur.fetchone()[0]
#Now let’s get Standard Deviation using our function that we added
cur.execute("select Stdev(Val) from MyData")
Stdev = cur.fetchone()[0]
#And Calculate Range
Range = float(abs(float(Max)-float(Min)))
if (Stdev == None):
print("================================ Data Error ===============================")
print(" Insufficient Population Size, Or Bad Data.")
print("*****************************************************************************")
elif (abs(Max-Min) == 0):
print("================================ Data Error ===============================")
print(" The entire Population Contains Identical values, Distribution Incalculable.")
print("******************************************************************************")
else:
Bin = [] #Holds the Bin Values
Frequency = [] #Holds the Bin Frequency for each Bin
#Establish the 1st Bin, which is based on (Standard Deviation * 3) being subtracted from the Mean
Bin.append(float((Average - ((3 * Stdev)))))
Frequency.append(0)
#Establish the remaining Bins, which is basically adding 1 Standard Deviation
#for each interation, -3, -2, -1, 1, 2, 3
for b in range(0,(BinCount) + 1):
Bin.append((float(Bin[(b)]) + Stdev))
Frequency.append(0)
for b in range(0,(BinCount) + 1):
#Lets exploit the Database and have it do the hard work calculating distribution
#of all the Bins, with SQL''s between operator, but making it left inclusive, right exclusive.
sqlBinFreq = "select count(*) as Frequency from MyData where val between {0} and {1} and Val < {2}". /
format(float((Bin[b])), float(Bin[(b + 1)]), float(Bin[(b + 1)]))
#If the Database Reports Values that fall between the Current Bin, Store the Frequency to a Bins Table.
for rowBinFreq in cur.execute(sqlBinFreq):
Frequency[(b + 1)] = rowBinFreq[''Frequency'']
sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". /
format(b, float(Bin[b]), Frequency[(b)])
cur.execute(sqlBinFreqInsert)
#Allthough this Demo is not likley produce values that
#fall outside of Standard Distribution
#if this demo was to Calculate with real data, we want to know
#how many non-Standard data points we have.
More = (More + Frequency[b])
More = abs((Records - More))
#Add the More value
sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". /
format((BinCount + 1), float(0), More)
cur.execute(sqlBinFreqInsert)
#Now Report the Analysis
print("================================ The Population ==============================")
print(" {0} {1} {2} {3} {4} {5}". /
format("Size".rjust(10, '' ''), /
"Max".rjust(10, '' ''), /
"Min".rjust(10, '' ''), /
"Mean".rjust(10, '' ''), /
"Range".rjust(10, '' ''), /
"Stdev".rjust(10, '' '')))
print("Aggregates: {0:10d} {1:10.4f} {2:10.4f} {3:10.4f} {4:10.4f} {5:10.4f}". /
format(Population, Max, Min, Average, Range, Stdev))
print("================================= The Bell Curve =============================")
LabelString = "{0} {1} {2} {3}". /
format("Bin".ljust(8, '' ''), /
"Ranges".rjust(8, '' ''), /
"Frequency".rjust(8, '' ''), /
"Histogram".rjust(6, '' ''))
print(LabelString)
print("------------------------------------------------------------------------------")
#Let''s Paint a Histogram
sqlChart = "select * from Bins order by Bin asc"
for rowChart in cur.execute(sqlChart):
if (rowChart[''Bin''] == 7):
#Bin 7 is not really a bin, but where we place the values that did not fit into the
#Normal Distribution. This script was tested against Excel''s Bell Curve Example
#https://support.microsoft.com/en-us/kb/213930
#and produces the same results. Feel free to test it.
BinName = "More"
ChartString = "{0:<6} {1:<10} {2:10.0f}". /
format(BinName, /
"", /
More)
else:
#Theses are the actual bins where values fall within the distribution.
BinName = (rowChart[''Bin''] + 1)
#Scale the Chart
fPercent = ((float(rowChart[''Frequency'']) / float(Records) * 100))
iPrecent = int(math.ceil(fPercent))
ChartString = "{0:<6} {1:10.4f} {2:10.0f} {3}". /
format(BinName, /
rowChart[''Val''], /
rowChart[''Frequency''], /
"".rjust(iPrecent, ''#''))
print(ChartString)
print("******************************************************************************")
#Commit to Database
con.commit()
#Clean Up
cur.close()
con.close()
except Exception as EXBellCurve:
pass
TraceInfo = traceback.format_exc()
raise Exception(TraceInfo)