Optimize your dataset's memory footprint by using the smallest suitable dtypes.
# Example: load a CSV into a DataFrame and report the possible dtype conversions.
# Dataset: sell_prices.csv.zip (read below as data/sell_prices.csv; presumably
# the extracted archive placed under data/ -- confirm the local path).
# Source data: https://www.kaggle.com/c/m5-forecasting-uncertainty/
df = pd.read_csv('data/sell_prices.csv')
# Pass the loaded frame to the reporting helper.
report_on_dataframe(df)
`report_on_dataframe`
shows you the possible dtype conversions and the resulting improvement. Note that the library tries to optimize memory based on the current values of the data; you should still be careful about overflow in any further transformations.
if __name__ == "__main__":
    # Demo: build a small constant-valued frame covering integer, float and
    # string columns, print the dtype report for it, then print the result of
    # pandas' own convert_dtypes() on the same frame for comparison.
    print("Given a dataframe, check for lowest possible conversions:")
    row_count = 100
    demo_columns = {
        # 256 and 65_536 sit just past the unsigned 8-bit and 16-bit ranges.
        "a": [0] * row_count,
        "b": [256] * row_count,
        "c": [65_536] * row_count,
        "d": [1_100.0] * row_count,
        "e": [100_101.0] * row_count,
        # One repeated string, and one column of distinct numeric strings.
        "str_a": ["hello"] * row_count,
        "str_b": list(map(str, range(row_count))),
    }
    df = pd.DataFrame(demo_columns)
    report_on_dataframe(df)
    print("convert_dtypes does a slightly different job:")
    converted = df.convert_dtypes()
    print(converted)