GSoC: Partitioning a Dask-DataFrame
June 12, 2021 GSoC Partitioning
Dask DataFrame structure
Internally, a Dask DataFrame is split into many partitions, where each partition is one Pandas DataFrame. These DataFrames are split row-wise, along the index.
There are numerous strategies for partitioning a Dask DataFrame, which determine how the rows of the DataFrame are divided among the resulting partitions. Common strategies include using a fixed number of partitions or partitioning based on column values.
By default, Dask DataFrames are partitioned based on the index. Like Pandas DataFrames, the index of a Dask DataFrame can be set from a column, so Dask DataFrames can make use of a column and its ordering to partition the data. We begin by comparing four (non-spatial) partitioning methods.
1. Partitioning a DataFrame using a fixed number of partitions
We begin by using a fixed number of partitions. This approach does not consider the internal structure of the data and may not efficiently partition the data.
import pandas as pd
import datetime as dt
import dask
import numpy as np
import dask.dataframe as dd
# Define a datetime range at 10-minute frequency, spanning January to the start of May 2015
ts = pd.date_range("2015-01-01 00:00", "2015-05-01 23:50", freq="10min")
# Create a DataFrame of random integers between 0 and 100, with four columns
df = pd.DataFrame(np.random.randint(0, 100, size=(len(ts), 4)), columns=list('ABCD'))
df.head()
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 30 | 4 | 12 | 88 |
| 1 | 20 | 0 | 83 | 35 |
| 2 | 53 | 40 | 15 | 45 |
| 3 | 84 | 25 | 40 | 84 |
| 4 | 87 | 45 | 14 | 27 |
# Partition DataFrame by fixed number
ddf = dd.from_pandas(df, npartitions=2)
ddf
|   | A | B | C | D |
|---|---|---|---|---|
| npartitions=2 |  |  |  |  |
| 0 | int64 | int64 | int64 | int64 |
| 8712 | ... | ... | ... | ... |
| 17423 | ... | ... | ... | ... |
print(ddf.divisions)
(0, 8712, 17423)
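To confirm that each partition really is a Pandas DataFrame, we can pull one out with get_partition (a minimal sketch; the row count assumes the divisions shown above):
# Each partition is a lazy, single-partition Dask DataFrame;
# computing it yields an ordinary Pandas DataFrame
part0 = ddf.get_partition(0).compute()
print(type(part0))  # <class 'pandas.core.frame.DataFrame'>
print(len(part0))   # 8712 rows, matching the first division boundary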
2. Partition based on time
Suppose we had a column with natural groupings, such as a date-time column, and we wanted to perform an expensive groupby operation for each month. In that case, we may want to partition based on this information.
# Create a DataFrame of random integers between 0 and 100, with four columns, indexed by time
df = pd.DataFrame(np.random.randint(0, 100, size=(len(ts), 4)), columns=list('ABCD'), index=ts)
df.head()
|   | A | B | C | D |
|---|---|---|---|---|
| 2015-01-01 00:00:00 | 43 | 74 | 0 | 70 |
| 2015-01-01 00:10:00 | 71 | 31 | 95 | 92 |
| 2015-01-01 00:20:00 | 72 | 76 | 20 | 74 |
| 2015-01-01 00:30:00 | 11 | 3 | 24 | 44 |
| 2015-01-01 00:40:00 | 47 | 79 | 25 | 30 |
# Partition DataFrame by fixed number
ddf = dd.from_pandas(df, npartitions=2)
ddf
|   | A | B | C | D |
|---|---|---|---|---|
| npartitions=2 |  |  |  |  |
| 2015-01-01 00:00:00 | int64 | int64 | int64 | int64 |
| 2015-03-02 12:00:00 | ... | ... | ... | ... |
| 2015-05-01 23:50:00 | ... | ... | ... | ... |
Above we arbitrarily defined the number of partitions as 2, but below we partition our data based on the starting month. This is a great improvement if we are interested in queries that make use of this information, such as groupbys. However, if we were interested in groupbys based on a specific column, this would not be appropriate.
ddf_time = ddf.repartition(freq='MS') # Month start
# We can check the number of partitions
ddf_time.npartitions
5
# We can check whether the partitions are divided
ddf_time.divisions
(Timestamp('2015-01-01 00:00:00'), Timestamp('2015-02-01 00:00:00'), Timestamp('2015-03-01 00:00:00'), Timestamp('2015-04-01 00:00:00'), Timestamp('2015-05-01 00:00:00'), Timestamp('2015-05-01 23:50:00'))
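Because each partition now covers roughly one calendar month, month-level work lines up with partition boundaries. As a hedged sketch (assuming the monthly divisions shown above), we can check the per-partition row counts and compute a month-start aggregation that Dask can answer largely partition by partition:
# Row count per (monthly) partition
ddf_time.map_partitions(len).compute()
# Monthly means, aligned with the month-start partitions
ddf_time.resample("MS").mean().compute()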
3. Partition based on a column
As discussed, Dask DataFrames partition data based on the index, and we can set a particular column as the index. The previous DataFrame only contained columns of random integers, so partitioning on one of them would not produce meaningful groups. To highlight the use case of partitioning based on a column, I add a new categorical column.
def my_user_function1(x):
    """Function to transform a numerical value into a categorical label based on if/elif statements"""
    if 0 < x <= 10:
        return 'x'
    elif 10 < x <= 25:
        return 'xx'
    elif 25 < x <= 50:
        return 'xxx'
    # everything else
    return 'xxxx'
# Create a DataFrame of random integers between 0 and 100, with four columns
df = pd.DataFrame(np.random.randint(0, 100, size=(len(ts), 4)), columns=list('ABCD'))
# Apply the function to create a new categorical column
df['my_cat_col'] = df['A'].apply(my_user_function1)
df.head()
|   | A | B | C | D | my_cat_col |
|---|---|---|---|---|---|
| 0 | 36 | 37 | 29 | 61 | xxx |
| 1 | 94 | 19 | 44 | 69 | xxxx |
| 2 | 28 | 3 | 35 | 43 | xxx |
| 3 | 86 | 26 | 92 | 2 | xxxx |
| 4 | 84 | 62 | 63 | 5 | xxxx |
df['my_cat_col'].value_counts()
my_cat_col
xxxx    8787
xxx     4287
xx      2634
x       1716
Name: count, dtype: int64
Here we sort the data and set the categorical column as the index. This ensures the DataFrame can be partitioned by the unique category values.
# Sort and set our index as our categorical column
df = df.sort_values('my_cat_col').set_index('my_cat_col', drop=False)
Given we have 4 unique values for our categorical column, I want to partition the pandas DataFrame into 4 partitions.
cat_len = len(df['my_cat_col'].unique())
cat_len
4
However, npartitions is only a request: when we first tried to create 4 partitions, before sorting and re-indexing, we only got 2! After reading the documentation, according to dask.dataframe.io, npartitions is optional and:
... depending on the size and index of the dataframe, the output may have fewer partitions than requested.
ddf = dd.from_pandas(df, npartitions=cat_len)
ddf
|   | A | B | C | D | my_cat_col |
|---|---|---|---|---|---|
| npartitions=4 |  |  |  |  |  |
| x | int64 | int64 | int64 | int64 | object |
| xx | ... | ... | ... | ... | ... |
| xxx | ... | ... | ... | ... | ... |
| xxxx | ... | ... | ... | ... | ... |
| xxxx | ... | ... | ... | ... | ... |
print(ddf.npartitions)
print(ddf.divisions)
4
('x', 'xx', 'xxx', 'xxxx', 'xxxx')
The partitions are not evenly balanced (one partition contains the majority of the data), which can hurt computational performance: the advantages of parallelism, and of using Dask at all, are effectively lost.
ddf.map_partitions(len).compute()
0    1716
1    2634
2    4287
3    8787
dtype: int64
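If this imbalance matters for a given workload, one option is to repartition by an approximate target size instead of by category value. This is a sketch rather than part of the workflow above; the partition_size argument and the "2MB" target are assumptions that may need adjusting for your Dask version and data:
# Trade the category-per-partition layout for roughly equally sized partitions
ddf_balanced = ddf.repartition(partition_size="2MB")
ddf_balanced.map_partitions(len).compute()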
4. Partition data based on a function
We can also work with our partitioned data using a user-provided function. This is handled by map_partitions, which maps the function across every underlying Pandas DataFrame.
# Define a function
def my_user_function2(col1, col2):
    """A function that multiplies column 1 by column 2"""
    return col1 * col2

# Apply function lazily to two DataFrame cols
ddf['result'] = dd.map_partitions(my_user_function2, ddf['A'], ddf['B'])
ddf.compute()
| my_cat_col | A | B | C | D | my_cat_col | result |
|---|---|---|---|---|---|---|
| x | 8 | 14 | 82 | 0 | x | 112 |
| x | 7 | 61 | 51 | 75 | x | 427 |
| x | 3 | 46 | 26 | 56 | x | 138 |
| x | 4 | 71 | 8 | 34 | x | 284 |
| x | 6 | 1 | 19 | 85 | x | 6 |
| ... | ... | ... | ... | ... | ... | ... |
| xxxx | 98 | 30 | 17 | 68 | xxxx | 2940 |
| xxxx | 87 | 96 | 21 | 29 | xxxx | 8352 |
| xxxx | 57 | 82 | 34 | 78 | xxxx | 4674 |
| xxxx | 78 | 79 | 79 | 7 | xxxx | 6162 |
| xxxx | 85 | 70 | 98 | 78 | xxxx | 5950 |
17424 rows × 6 columns
Conclusion
Partitioning data involves a re-shuffle to sort our data along a new index. In distributed computing, the shuffle is the process of redistributing all partitions across all workers. Shuffling the data is necessary when performing sorting, grouping, and indexing operations, because each row needs to be compared to every other row in the DataFrame to determine its correct relative position. Whilst a shuffle is necessary when we need to re-sort our data along a new index, it is time-expensive; if Pandas can handle the operation (in terms of memory and computation time) without partitioning via Dask, Pandas should be adopted.
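As a minimal illustration of this cost (a sketch using the column-partitioned ddf from above), re-indexing on a different column forces exactly this kind of shuffle:
# Setting a new index shuffles rows between partitions so that each
# partition ends up owning a contiguous range of 'B' values
ddf_by_b = ddf.set_index('B')
print(ddf_by_b.divisions)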
Up until now, we have partitioned our data along a single dimension. Data with two or more dimensions (like spatial data) requires different techniques, which I will discuss over the next few blog posts.