The data:

CREATE FUNCTION random_integers(low INTEGER, high INTEGER, amount INTEGER)
RETURNS TABLE(fctr INTEGER, v1 INTEGER, v2 INTEGER, v3 INTEGER)
LANGUAGE PYTHON
{
    result = dict()
    result['fctr'] = numpy.random.randint(1, 100, size=(amount,))
    result['v1'] = numpy.random.randint(low, high, size=(amount,))
    result['v2'] = numpy.random.randint(low, high, size=(amount,))
    result['v3'] = numpy.random.randint(low, high, size=(amount,))
    return result
};

create table mini(fctr integer, v1 integer, v2 integer, v3 integer);
insert into mini select * from random_integers(1, 10, 100000);
update mini set v2 = 1;

-----Original Message-----
From: Anderson, David B [ICG-MKTS]
Sent: Wednesday, July 06, 2016 3:49 PM
To: users-list
Subject: Python Aggregate functions

Mark,

I think there is a bug in the implementation of PYTHON_MAP aggregate functions. The return value of a PYTHON_MAP aggregate function can change when other functions appear in the same query. The return value of my function weighted_percentile_0 (definition below) changes if I include median() in the query: the results are incorrect when median() is not in the query, but correct when it is.

In this test case, the weights (v2) are all 1 and the values (v1) are integers sampled uniformly from [1, 10). There are ~1000 rows per fctr value, so my function should reduce to min(v1). Is some information bleeding between threads? Should I submit a bug report? This is on the default branch.
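For anyone who wants to reproduce the dataset without a database: the UDF body can be run as plain NumPy. This is a standalone sketch of the setup above (the seed and the dict-building helper are my additions, for reproducibility; inside MonetDB the table columns arrive as NumPy arrays in the same shape):

```python
import numpy as np

np.random.seed(0)  # my addition; the UDF itself is unseeded

def random_integers(low, high, amount):
    # Mirrors the SQL UDF: fctr in [1, 100), v1..v3 in [low, high)
    return {
        'fctr': np.random.randint(1, 100, size=(amount,)),
        'v1': np.random.randint(low, high, size=(amount,)),
        'v2': np.random.randint(low, high, size=(amount,)),
        'v3': np.random.randint(low, high, size=(amount,)),
    }

mini = random_integers(1, 10, 100000)
mini['v2'] = np.ones(100000, dtype=int)  # "update mini set v2 = 1"
```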
Thanks,
Dave

sql>select fctr, weighted_percentile_0(v1,v2), min(v1) from mini where fctr < 10 group by fctr order by fctr;
+------+--------------------------+------+
| fctr | L1                       | L2   |
+======+==========================+======+
|    1 |                        3 |    1 |
|    2 |                        2 |    1 |
|    3 |                        2 |    1 |
|    4 |                        1 |    1 |
|    5 |                        3 |    1 |
|    6 |                        3 |    1 |
|    7 |                        2 |    1 |
|    8 |                        1 |    1 |
|    9 |                        2 |    1 |
+------+--------------------------+------+
9 tuples (126.928ms)

sql>select fctr, weighted_percentile_0(v1,v2), min(v1), median(v1) from mini where fctr < 10 group by fctr order by fctr;
+------+--------------------------+------+------+
| fctr | L1                       | L2   | L3   |
+======+==========================+======+======+
|    1 |                        1 |    1 |    5 |
|    2 |                        1 |    1 |    5 |
|    3 |                        1 |    1 |    5 |
|    4 |                        1 |    1 |    5 |
|    5 |                        1 |    1 |    5 |
|    6 |                        1 |    1 |    5 |
|    7 |                        1 |    1 |    5 |
|    8 |                        1 |    1 |    5 |
|    9 |                        1 |    1 |    5 |
+------+--------------------------+------+------+
9 tuples (519.195ms)

sql>

CREATE AGGREGATE weighted_percentile_0(a DOUBLE, w DOUBLE)
RETURNS DOUBLE
LANGUAGE PYTHON_MAP
{
    import numpy as np

    # Standardize and sort based on values in a
    q = np.array([0]) / 100.0
    idx = np.argsort(a)
    a_sort = a[idx]
    w_sort = w[idx]

    # Get the cumulative sum of weights
    ecdf = np.cumsum(w_sort)

    # Find the percentile index positions associated with the percentiles
    p = q * (w_sort.sum() - 1)

    # Find the bounding indices (both low and high)
    idx_low = np.searchsorted(ecdf, p, side='right')
    idx_high = np.searchsorted(ecdf, p + 1, side='right')
    idx_high[idx_high > ecdf.size - 1] = ecdf.size - 1

    # Calculate the interpolation weights
    weights_high = p - np.floor(p)
    weights_low = 1.0 - weights_high

    # Extract the low/high indexes and multiply by the corresponding weights
    x1 = np.take(a_sort, idx_low) * weights_low
    x2 = np.take(a_sort, idx_high) * weights_high
    wp = np.add(x1, x2)
    return wp[0]
};
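For what it's worth, the aggregate body does behave as expected when run as an ordinary Python function outside the database, which points at the execution engine rather than the algorithm. Here is a standalone sketch (the name weighted_percentile and the q parameter are mine, generalizing the hard-coded 0th percentile); with q=0 and unit weights it returns min(a):

```python
import numpy as np

def weighted_percentile(a, w, q=0.0):
    # Sort the values and carry the weights along
    idx = np.argsort(a)
    a_sort = a[idx]
    w_sort = w[idx]
    # Cumulative weight acts as an empirical CDF
    ecdf = np.cumsum(w_sort)
    # Position of the requested percentile on the weight scale
    p = np.array([q]) / 100.0 * (w_sort.sum() - 1)
    # Bounding indices for linear interpolation
    idx_low = np.searchsorted(ecdf, p, side='right')
    idx_high = np.searchsorted(ecdf, p + 1, side='right')
    idx_high[idx_high > ecdf.size - 1] = ecdf.size - 1
    # Interpolate between the bounding values
    weights_high = p - np.floor(p)
    weights_low = 1.0 - weights_high
    x1 = np.take(a_sort, idx_low) * weights_low
    x2 = np.take(a_sort, idx_high) * weights_high
    return np.add(x1, x2)[0]

a = np.array([3.0, 1.0, 7.0, 5.0])
w = np.ones_like(a)
weighted_percentile(a, w, 0.0)   # -> 1.0, i.e. min(a)
weighted_percentile(a, w, 50.0)  # -> 4.0, matches np.percentile(a, 50)
```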