Correlation of time based variables in MySQL

I often find it useful to calculate the correlation of two time-based variables directly inside MySQL. For example:-

SELECT calcCorrelation('BHP','RIO','3600');

Where the first two inputs are the variable selections and the third is the time based quantization to use. Since MySQL does not have any built-in corr() function you need to do the heavy lifting yourself with a stored function.

Included below is a function that you can use by simply placing the dataset you want correlated into a (temporary) table named `temp_calc_correlation_data` with columns `variable_name`,`value` and `timestamp`.

DROP TEMPORARY TABLE IF EXISTS `temp_calc_correlation_data`;
CREATE TEMPORARY TABLE `temp_calc_correlation_data` (`variable_name` VARCHAR(32), `value` DOUBLE, `timestamp` DATETIME) ENGINE=MEMORY;

You might therefore find yourself doing something like this to populate data into that table:-

INSERT INTO `temp_calc_correlation_data` (
	SELECT
		`ticker_symbol` AS 'variable_name',
		((`bid`+`offer`)/2) AS 'value',
		`timestamp`
	FROM `market_data` 
	WHERE (2>1)
	AND `timestamp` >= '2010-01-01 00:00:00'
	AND `timestamp` <= '2010-06-01 00:00:00'
	AND (`ticker_symbol` = 'BHP' OR 'RIO') 
	ORDER BY `timestamp`
);

Unfortunately MySQL will not handle dynamic SQL inside a stored function which might otherwise allow us to create a truly portable function. Thus if you want to avoid the process of shoveling data into the `temp_calc_correlation_data` table first you can also adjust the calcCorrelation() function and hard code in the source table and add some datetime inputs to calcCorrelation().

The really great thing about doing all this in the database is that it's fast compared to developing something external that has to connect, slurp, calculate and re-insert a result.

If you can avoid it, try not to implement routines outside the database when they are just data manipulations - I literally have 100's of stored functions and procedures that implement all kinds of stuff from API interfaces to login auth schemes and session data management which is all kept very fast because the data stays in the database until it needs to come out. It's not always possible but it's great if you can do it and find a way to QA it.

Enjoy

# =============================================================================
# http://www.nicholasdejong.com/story/correlation-time-based-variables-mysql
# =============================================================================

DELIMITER $$
DROP FUNCTION IF EXISTS `calcCorrelation`$$
CREATE FUNCTION `calcCorrelation`(
	i_variable_name_1 	VARCHAR(64),
	i_variable_name_2 	VARCHAR(64),
	i_quantization_seconds 	DOUBLE
)
RETURNS VARCHAR(128)
LANGUAGE SQL DETERMINISTIC
BEGIN

DECLARE samp_x	 	DOUBLE		DEFAULT NULL;
DECLARE samp_y	 	DOUBLE		DEFAULT NULL;
DECLARE sum_sq_x 	DOUBLE		DEFAULT NULL;
DECLARE sum_sq_y 	DOUBLE		DEFAULT NULL;
DECLARE sum_coproduct	DOUBLE		DEFAULT NULL;
DECLARE mean_x		DOUBLE		DEFAULT NULL;
DECLARE mean_y		DOUBLE		DEFAULT NULL;
DECLARE sweep		DOUBLE		DEFAULT NULL;
DECLARE delta_x		DOUBLE		DEFAULT NULL;
DECLARE delta_y		DOUBLE		DEFAULT NULL;
DECLARE pop_sd_x	DOUBLE		DEFAULT NULL;
DECLARE pop_sd_y	DOUBLE		DEFAULT NULL;
DECLARE cov_x_y		DOUBLE		DEFAULT NULL;
DECLARE correlation	DOUBLE		DEFAULT NULL;
DECLARE ratio_average	DOUBLE		DEFAULT NULL;
DECLARE ratio_stddev	DOUBLE		DEFAULT NULL;
DECLARE icount		INT		DEFAULT NULL;

# c_samples
# =============================================================================
DECLARE c_samples_done INT DEFAULT 0;
DECLARE c_samples CURSOR FOR (
	SELECT `x`,`y` FROM `samp_xy`
);
DECLARE CONTINUE HANDLER FOR NOT FOUND SET c_samples_done = 1;

# Create temporary tables to inject our values into
DROP TEMPORARY TABLE IF EXISTS `var_x`; CREATE TEMPORARY TABLE `var_x` (`value` DOUBLE, `timestamp_round` INT(10), KEY `timestamp_round`(`timestamp_round`)) ENGINE=MEMORY;
DROP TEMPORARY TABLE IF EXISTS `var_y`; CREATE TEMPORARY TABLE `var_y` (`value` DOUBLE, `timestamp_round` INT(10), KEY `timestamp_round`(`timestamp_round`)) ENGINE=MEMORY;
DROP TEMPORARY TABLE IF EXISTS `samp_xy`; CREATE TEMPORARY TABLE `samp_xy` (`x` DOUBLE, `y` DOUBLE, `unixtime` INT(10)) ENGINE=MEMORY;

# Insert x samples into temp table
INSERT INTO `var_x` (
	SELECT
		AVG(`value`),
		ROUND((UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds),0)
	FROM `temp_calc_correlation_data`
	WHERE `variable_name` = i_variable_name_1
	GROUP BY ROUND(UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds,0)
);

# Insert y samples into temp table
INSERT INTO `var_y` (
	SELECT
		AVG(`value`),
		ROUND((UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds),0)
	FROM `temp_calc_correlation_data`
	WHERE `variable_name` = i_variable_name_2
	GROUP BY ROUND(UNIX_TIMESTAMP(`timestamp`)/i_quantization_seconds,0)
);

# Insert corrosponding samples into a new table
INSERT INTO `samp_xy` (
	SELECT
		`var_x`.`value`		AS 'x',
		`var_y`.`value`		AS 'y',
		`var_x`.`timestamp_round` * i_quantization_seconds
	FROM `var_x`
	JOIN `var_y` ON `var_x`.`timestamp_round` = `var_y`.`timestamp_round`
);

# Calculate ratio_average
# =============================================================================
SET ratio_average = (SELECT AVG(`x`/`y`) FROM `samp_xy`);

# Calculate ratio_stddev
# =============================================================================
SET ratio_stddev = (SELECT STDDEV(`x`/`y`) FROM `samp_xy`);

# Calculate correlation
# =============================================================================

# sum_sq_x = 0
SET sum_sq_x = 0;

# sum_sq_y = 0
SET sum_sq_y = 0;

# sum_coproduct = 0
SET sum_coproduct = 0;

# mean_x = x[1]
SET mean_x = (SELECT `x` FROM `samp_xy` LIMIT 1);

# mean_y = y[1]
SET mean_y = (SELECT `y` FROM `samp_xy` LIMIT 1);

# for i in 2 to N:
# =============================================================================
SET c_samples_done   = '0';
SET icount = '1';
OPEN c_samples;
REPEAT
	FETCH c_samples INTO samp_x,samp_y;
	IF NOT c_samples_done THEN
		IF icount > 1 THEN
			# sweep = (i - 1.0) / i
			SET sweep = ((icount - 1.0) / icount);

			# delta_x = x[i] - mean_x
			SET delta_x = samp_x - mean_x;

			# delta_y = y[i] - mean_y
			SET delta_y = samp_y - mean_y;

			# sum_sq_x += delta_x * delta_x * sweep
			SET sum_sq_x = sum_sq_x + (delta_x * delta_x * sweep);

			# sum_sq_y += delta_y * delta_y * sweep
			SET sum_sq_y = sum_sq_y + (delta_y * delta_y * sweep);

			# sum_coproduct += delta_x * delta_y * sweep
			SET sum_coproduct = sum_coproduct + (delta_x * delta_y * sweep);

			# mean_x += delta_x / i
			SET mean_x = mean_x + (delta_x / icount);

			# mean_y += delta_y / i 
			SET mean_y = mean_y + (delta_y / icount);

		END IF;
		SET icount = icount + 1;
	END IF;
UNTIL c_samples_done END REPEAT;
CLOSE c_samples;

# pop_sd_x = sqrt( sum_sq_x )
SET pop_sd_x = SQRT(sum_sq_x);

# pop_sd_y = sqrt( sum_sq_y )
SET pop_sd_y = SQRT(sum_sq_y);

# cov_x_y = sum_coproduct
SET cov_x_y = sum_coproduct;

# correlation = cov_x_y / (pop_sd_x * pop_sd_y)
SET correlation = cov_x_y / (pop_sd_x * pop_sd_y);

# 
# =============================================================================

# These drop operations can take a LONG time to complete...
#DROP TEMPORARY TABLE IF EXISTS `var_x`;
#DROP TEMPORARY TABLE IF EXISTS `var_y`;
#DROP TEMPORARY TABLE IF EXISTS `samp_xy`;

# Return data
# =============================================================================
RETURN correlation;
##RETURN CONCAT(correlation,"|",icount-1,"|",ratio_average,"|",ratio_stddev);

END$$
DELIMITER ;

Post new comment

The content of this field is kept private and will not be shown publicly.
  • Web page addresses and e-mail addresses turn into links automatically.
  • Allowed HTML tags: <a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd>
  • Lines and paragraphs break automatically.

More information about formatting options

CAPTCHA
This question is for testing whether you are a human visitor and to prevent automated spam submissions.
Image CAPTCHA
Enter the characters shown in the image.