#!/usr/bin/python # -*- coding: utf-8 -*- import sqlite3 import math import random import os import sys import traceback class StdevFunc: def __init__(self): self.M = 0.0 #Mean self.V = 0.0 #Used to Calculate Variance self.S = 0.0 #Standard Deviation self.k = 1 #Population or Small def step(self, value): try: if value is None: return None tM = self.M self.M += (value - tM) / self.k self.V += (value - tM) * (value - self.M) self.k += 1 except Exception as EXStep: pass return None def finalize(self): try: if ((self.k - 1) < 3): return None #Now with our range Calculated, and Multiplied finish the Variance Calculation if (self.k > 40): self.V = (self.V / (self.k-2)) else: #For Small Populations self.V = (self.V / (self.k-1)) #Standard Deviation is the Square Root of Variance self.S = math.sqrt(self.V) return self.S except Exception as EXFinal: pass return None def Histogram(Population): try: BinCount = 6 More = 0 #a = 1 #For testing Trapping #b = 0 #and Trace Back #c = (a / b) #with Detailed Info #If you want to store the Database #uncDatabase = os.path.join(os.getcwd(),"BellCurve.db3") #con = sqlite3.connect(uncDatabase) #If you want the database in Memory con = sqlite3.connect(':memory:') #row_factory that allows accessing fields by Row and Col Name con.row_factory = sqlite3.Row #Add our Non Persistent, Runtime Standard Deviation Function to the Database con.create_aggregate("Stdev", 1, StdevFunc) #Lets Grab a Cursor cur = con.cursor() #Lets Initialize some tables, so each run with be clear of previous run cur.executescript('drop table if exists MyData;') #executescript requires ; at the end of the string cur.execute("create table IF NOT EXISTS MyData('Val' FLOAT)") cur.executescript('drop table if exists Bins;') #executescript requires ; at the end of the string cur.execute("create table IF NOT EXISTS Bins('Bin' UNSIGNED BIG INT, 'Val' FLOAT, 'Frequency' UNSIGNED BIG INT)") #Lets generate some random data, and insert in to the Database for n in range(0,(Population)): sql = "insert into MyData(Val) values ({0})".format(random.uniform(-1,1)) cur.execute(sql) #Now let’s calculate some built in Aggregates, that SQLite comes with cur.execute("select Avg(Val) from MyData") Average = cur.fetchone()[0] cur.execute("select Max(Val) from MyData") Max = cur.fetchone()[0] cur.execute("select Min(Val) from MyData") Min = cur.fetchone()[0] cur.execute("select Count(Val) from MyData") Records = cur.fetchone()[0] #Now let’s get Standard Deviation using our function that we added cur.execute("select Stdev(Val) from MyData") Stdev = cur.fetchone()[0] if (Stdev == None): print("================================ Data Error --=================================") print(" Insufficient Population Size, Or Bad Data.") print("**********************************************************************************") elif (abs(Max-Min) == 0): print("================================ Data Error --=================================") print(" The entire Population Contains Identical values, Distribution Incalculable..") print("**********************************************************************************") else: Bin = [] #Holds the Bin Values Frequency = [] #Holds the Bin Frequency for each Bin #Establish the 1st Bin, which is based on (Standard Deviation * 3) being subtracted from the Mean LowBin = (Average - ((3 * Stdev))) Bin.append(LowBin) #We start the Index with 1 because we already added the LowBin and as such increased the Bin Count is 1 for b in range(0,(BinCount)): #Establish the remaining Bins, which is basically adding 1 Standard Deviation for each interation, -3, -2, -1, 1, 2, 3 Bin.append((Bin[(b)] + Stdev)) Frequency.append(0) for b in range(0,(BinCount)): #With our Bins Calculated, lets exploit the Database and have it do the hard work calculating distribution #of all the Bins, with SQL's between operator. sqlBinFreq = "select count(*) as Frequency from MyData where val between {0} and {1}".format(Bin[b], Bin[(b + 1)]) #If the Database Reports Values that fall between the Current Bin, Store the Frequency to a Bins Table. for rowBinFreq in cur.execute(sqlBinFreq): Frequency[b] = rowBinFreq['Frequency'] sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})".format(b, Bin[b], Frequency[b]) cur.execute(sqlBinFreqInsert) #Allthough the Demo is not likley produce values that fall outside of Standard Distribution #if this demo was to Calculate with real data, we want to know how many non-Standard data points we have. More = (More + Frequency[b]) #Now Report the Analysis print("================================ The Population ==================================") print(" {0} {1} {2} {3} {4} {5}".format("Size".rjust(10, ' '), "Max".rjust(10, ' '), "Min".rjust(10, ' '), "Mean".rjust(10, ' '), "Range".rjust(10, ' '), "Stdev".rjust(10, ' '))) print("Aggregates: {0:10d} {1:10.4f} {2:10.4f} {3:10.4f} {4:10.4f} {5:10.4f}".format(Population, Max, Min, Average, abs(Max-Min), Stdev)) print("================================ The Distribution ================================") print("Bins: {0:10.0f} {1:10.0f} {2:10.0f} {3:10.0f} {4:10.0f} {5:10.0f}".format(1, 2, 3, 4, 5, 6)) print("----------------------------------------------------------------------------9-----") print("Ranges: {0:10.4f} {1:10.4f} {2:10.4f} {3:10.4f} {4:10.4f} {5:10.4f}".format(Bin[0], Bin[1], Bin[2], Bin[3], Bin[4], Bin[5])) print("Frequency: {0:10.0f} {1:10.0f} {2:10.0f} {3:10.0f} {4:10.0f} {5:10.0f}".format(Frequency[0], Frequency[1], Frequency[2], Frequency[3], Frequency[4], Frequency[5])) print("----------------------------------------------------------------------------------") print("More: {0:10.0f}".format(abs((More) - Records))) print("Bined: {0:10.0f}".format(abs((More)))) print("================================= The Histogram ==================================") LabelString = "{0} {1} {2} {3}".format("Bin".rjust(4, ' '), "Ranges".rjust(10, ' '), "Frequency".rjust(10, ' '), "Histogram".rjust(10, ' ')) sqlChart = "select * from Bins order by Bin asc" print(LabelString) print("----------------------------------------------------------------------------------") #Let's Paint a Histogram for rowChart in cur.execute(sqlChart): fPercent = ((float(rowChart['Frequency']) / float(More) * 100)) iPrecent = int(math.ceil(fPercent)) ChartString = "{0:4.0f} {1:10.4f} {2:10.0f} {3}".format((rowChart['Bin'] + 1), rowChart['Val'], rowChart['Frequency'], "".rjust(iPrecent, '#')) print(ChartString) print("**********************************************************************************") #Clean Up con.commit() cur.close() con.close() except Exception as EXBellCurve: pass TraceInfo = traceback.format_exc() raise Exception(TraceInfo) print("*********************************************************************************") print("Using SQLite, leverage the create_aggregate(), and SQL's Between Operator to") print("calculate distribution, then Demonstrate and Store the results in a Database.\n") print("This demo creates a Population of values between -1 and 1 generated with a") print("Pseudo Random Number Generator. The Mean value should be very close to zero,") print("with a Range value of about 2, contained within a Standard Distribution.") PythonVersion = sys.version_info[0] #Population = 2 #To Test Insufficient Population Size #Population = 3 #To Test Minimum Population Size #Population = 8 #Population of 8 #Population = 39 #Test Small Population #Population = 40 #Test Large Population Population = (16 ** 2) #Population of 256 #Population = (64 ** 2) #Population of 4096 #Population = (128 ** 2) #Population of 16384 #Population = (256 ** 2) #Population of 65536 #Population = (1024 ** 2) #Population of 1048576 Histogram(Population) #Do a Distribution Analysis #Want to see the affect of Population size on distribution? Try this loop #and see how the distribution becomes unreliable with small populations #for i in range(15, ((16 ** 2)), 64): # Histogram(i) if (PythonVersion == 3): kb = input("Press Enter to continue...") else: kb = raw_input("Press Enter to continue...")