I often find it useful to calculate the correlation of two time-based variables directly inside MySQL. For example:-
SELECT calcCorrelation('BHP','RIO','3600');
Where the first two inputs are the variable selections and the third is the time based quantization to use. Since MySQL does not have any built-in corr() function you need to do the heavy lifting yourself with a stored function.
Included below is a function that you can use by simply placing the dataset you want correlated into a (temporary) table named `temp_calc_correlation_data` with columns `variable_name`,`value` and `timestamp`.
DROP TEMPORARY TABLE IF EXISTS `temp_calc_correlation_data`; CREATE TEMPORARY TABLE `temp_calc_correlation_data` (`variable_name` VARCHAR(32), `value` DOUBLE, `timestamp` DATETIME) ENGINE=MEMORY;
You might therefore find yourself doing something like this to populate data into that table:-
INSERT INTO `temp_calc_correlation_data` ( SELECT `ticker_symbol` AS 'variable_name', ((`bid`+`offer`)/2) AS 'value', `timestamp` FROM `market_data` WHERE (2>1) AND `timestamp` >= '2010-01-01 00:00:00' AND `timestamp` <= '2010-06-01 00:00:00' AND (`ticker_symbol` = 'BHP' OR 'RIO') ORDER BY `timestamp` );
Unfortunately MySQL will not handle dynamic SQL inside a stored function which might otherwise allow us to create a truly portable function. Thus if you want to avoid the process of shoveling data into the `temp_calc_correlation_data` table first you can also adjust the calcCorrelation() function and hard code in the source table and add some datetime inputs to calcCorrelation().
The really great thing about doing all this in the database is that it's fast compared to developing something external that has to connect, slurp, calculate and re-insert a result.
If you can avoid it, try not to implement routines outside the database when they are just data manipulations - I literally have 100's of stored functions and procedures that implement all kinds of stuff from API interfaces to login auth schemes and session data management which is all kept very fast because the data stays in the database until it needs to come out. It's not always possible but it's great if you can do it and find a way to QA it.
Enjoy
# ============================================================================= # http://www.nicholasdejong.com/story/correlation-time-based-variables-mysql # ============================================================================= DELIMITER $$ DROP FUNCTION IF EXISTS `calcCorrelation`$$ CREATE FUNCTION `calcCorrelation`( i_variable_name_1 VARCHAR(64), i_variable_name_2 VARCHAR(64), i_quantization_seconds DOUBLE ) RETURNS VARCHAR(128) LANGUAGE SQL DETERMINISTIC BEGIN DECLARE samp_x DOUBLE DEFAULT NULL; DECLARE samp_y DOUBLE DEFAULT NULL; DECLARE sum_sq_x DOUBLE DEFAULT NULL; DECLARE sum_sq_y DOUBLE DEFAULT NULL; DECLARE sum_coproduct DOUBLE DEFAULT NULL; DECLARE mean_x DOUBLE DEFAULT NULL; DECLARE mean_y DOUBLE DEFAULT NULL; DECLARE sweep DOUBLE DEFAULT NULL; DECLARE delta_x DOUBLE DEFAULT NULL; DECLARE delta_y DOUBLE DEFAULT NULL; DECLARE pop_sd_x DOUBLE DEFAULT NULL; DECLARE pop_sd_y DOUBLE DEFAULT NULL; DECLARE cov_x_y DOUBLE DEFAULT NULL; DECLARE correlation DOUBLE DEFAULT NULL; DECLARE ratio_average DOUBLE DEFAULT NULL; DECLARE ratio_stddev DOUBLE DEFAULT NULL; DECLARE icount INT DEFAULT NULL; # c_samples # ============================================================================= DECLARE c_samples_done INT DEFAULT 0; DECLARE c_samples CURSOR FOR ( SELECT `x`,`y` FROM `samp_xy` ); DECLARE CONTINUE HANDLER FOR NOT FOUND SET c_samples_done = 1; # Create temporary tables to inject our values into DROP TEMPORARY TABLE IF EXISTS `var_x`; CREATE TEMPORARY TABLE `var_x` (`value` DOUBLE, `timestamp_round` INT(10), KEY `timestamp_round`(`timestamp_round`)) ENGINE=MEMORY; DROP TEMPORARY TABLE IF EXISTS `var_y`; CREATE TEMPORARY TABLE `var_y` (`value` DOUBLE, `timestamp_round` INT(10), KEY `timestamp_round`(`timestamp_round`)) ENGINE=MEMORY; DROP TEMPORARY TABLE IF EXISTS `samp_xy`; CREATE TEMPORARY TABLE `samp_xy` (`x` DOUBLE, `y` DOUBLE, `unixtime` INT(10)) ENGINE=MEMORY; # Insert x samples into temp table INSERT INTO `var_x` ( SELECT AVG(`value`), ROUND((UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds),0) FROM `temp_calc_correlation_data` WHERE `variable_name` = i_variable_name_1 GROUP BY ROUND(UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds,0) ); # Insert y samples into temp table INSERT INTO `var_y` ( SELECT AVG(`value`), ROUND((UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds),0) FROM `temp_calc_correlation_data` WHERE `variable_name` = i_variable_name_2 GROUP BY ROUND(UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds,0) ); # Insert corrosponding samples into a new table INSERT INTO `samp_xy` ( SELECT `var_x`.`value` AS 'x', `var_y`.`value` AS 'y', `var_x`.`timestamp_round` * i_quantization_seconds FROM `var_x` JOIN `var_y` ON `var_x`.`timestamp_round` = `var_y`.`timestamp_round` ); # Calculate ratio_average # ============================================================================= SET ratio_average = (SELECT AVG(`x`/`y`) FROM `samp_xy`); # Calculate ratio_stddev # ============================================================================= SET ratio_stddev = (SELECT STDDEV(`x`/`y`) FROM `samp_xy`); # Calculate correlation # ============================================================================= # sum_sq_x = 0 SET sum_sq_x = 0; # sum_sq_y = 0 SET sum_sq_y = 0; # sum_coproduct = 0 SET sum_coproduct = 0; # mean_x = x[1] SET mean_x = (SELECT `x` FROM `samp_xy` LIMIT 1); # mean_y = y[1] SET mean_y = (SELECT `y` FROM `samp_xy` LIMIT 1); # for i in 2 to N: # ============================================================================= SET c_samples_done = '0'; SET icount = '1'; OPEN c_samples; REPEAT FETCH c_samples INTO samp_x,samp_y; IF NOT c_samples_done THEN IF icount > 1 THEN # sweep = (i - 1.0) / i SET sweep = ((icount - 1.0) / icount); # delta_x = x[i] - mean_x SET delta_x = samp_x - mean_x; # delta_y = y[i] - mean_y SET delta_y = samp_y - mean_y; # sum_sq_x += delta_x * delta_x * sweep SET sum_sq_x = sum_sq_x + (delta_x * delta_x * sweep); # sum_sq_y += delta_y * delta_y * sweep SET sum_sq_y = sum_sq_y + (delta_y * delta_y * sweep); # sum_coproduct += delta_x * delta_y * sweep SET sum_coproduct = sum_coproduct + (delta_x * delta_y * sweep); # mean_x += delta_x / i SET mean_x = mean_x + (delta_x / icount); # mean_y += delta_y / i SET mean_y = mean_y + (delta_y / icount); END IF; SET icount = icount + 1; END IF; UNTIL c_samples_done END REPEAT; CLOSE c_samples; # pop_sd_x = sqrt( sum_sq_x ) SET pop_sd_x = SQRT(sum_sq_x); # pop_sd_y = sqrt( sum_sq_y ) SET pop_sd_y = SQRT(sum_sq_y); # cov_x_y = sum_coproduct SET cov_x_y = sum_coproduct; # correlation = cov_x_y / (pop_sd_x * pop_sd_y) SET correlation = cov_x_y / (pop_sd_x * pop_sd_y); # # ============================================================================= # These drop operations can take a LONG time to complete... #DROP TEMPORARY TABLE IF EXISTS `var_x`; #DROP TEMPORARY TABLE IF EXISTS `var_y`; #DROP TEMPORARY TABLE IF EXISTS `samp_xy`; # Return data # ============================================================================= RETURN correlation; ##RETURN CONCAT(correlation,"|",icount-1,"|",ratio_average,"|",ratio_stddev); END$$ DELIMITER ;

Post new comment