Commit 43e461aa authored by mhuels's avatar mhuels
Browse files

version 0.6

improved performance of file-reading operations (especially for large ensemble sizes) (needs pandas.read_csv())
parent 09cec73e
......@@ -7,6 +7,7 @@ from analyzebd.units.unit import Unit
from copy import deepcopy
import warnings
from analyzebd.helpers.file_reader import find_setting_in_file
import pandas as pd
class Quantity:
def __init__(self, **kwargs):
......@@ -127,14 +128,24 @@ class Quantity:
self.iStart = iStart
self.iEnd = iEnd
if self.numberOfEnsembles == 1:
if self.numberOfEnsembles is None:
self.setArray(self.__readFromFileSingle(self.iStart, self.get_iLength(), columns=columns))
else:
self.setArray(self.__readFromFileEnsembles(self.numberOfEnsembles, columns=columns))
return self
def __readFromFileSingle(self, iStart, iLength, columns=0):
array = np.loadtxt(skipper(self.src), skiprows=iStart, max_rows=iLength, usecols=columns)
# deprecated: scales very badly for larger ensemble sizes (>500)
# def __readFromFileSingle(self, iStart, iLength, columns=0):
# array = np.loadtxt(skipper(self.src), skiprows=iStart, max_rows=iLength, usecols=columns)
# return array
def __readFromFileSingle(self, iStart, iLength, columns=[0]):
if isinstance(columns, int):
columns = [columns]
df = pd.read_csv(self.src, skiprows=iStart, nrows=iLength, usecols=columns, comment='#', sep='\t',
header=None)
array = df.to_numpy()
array = np.squeeze(array)
return array
def __readFromFileEnsembles(self, numberOfEnsembles, columns=0):
......@@ -149,16 +160,26 @@ class Quantity:
numberOfColumns = len(columns)
else:
raise TypeError("Invalid type for columns. Use either int or list of int.")
arrays = []
for i in range(numberOfEnsembles):
iStart = self.iStart + i * numberOfTimesteps
arrays.append(self.__readFromFileSingle(iStart, max_rows, columns))
array = np.stack(arrays)
# read all at once and reshape (always the preferred way, at least with pd.read_csv())
array = self.__readFromFileSingle(0, numberOfEnsembles * numberOfTimesteps, columns=columns)
array = np.reshape(array, (numberOfEnsembles, numberOfTimesteps, numberOfColumns))
array = array[:, self.iStart:max_rows]
array = np.squeeze(array)
# read one at a time (with correct length) and stack afterwards (slow!)
# arrays = []
# for i in range(numberOfEnsembles):
# iStart = self.iStart + i * numberOfTimesteps
# arrays.append(self.__readFromFileSingle(iStart, max_rows, columns))
# array = np.stack(arrays)
# array = np.squeeze(array)
return array
def __readNumberOfEnsembles(self):
"""
Needs self.src to be set.
:return: N (int) or None (if only single file)
"""
N = find_setting_in_file(self.src, "numberOfEnsembles", dtype=int)
if N is None:
......@@ -166,7 +187,7 @@ class Quantity:
data = f.read()
N = data.count("Ensemble")
if N == 0:
N = 1
N = None
return N
def __readNumberOfTimesteps(self):
......
from setuptools import setup, find_packages
setup(name='analyzebd',
version='0.5',
version='0.6',
packages=find_packages(),
install_requires=['matplotlib', 'pathlib', 'scipy'])
install_requires=['matplotlib', 'pathlib', 'scipy', 'pandas'])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment