# -*- coding: utf-8 -*-
"""
This module provides the base classes for data readers, such as
`XLRD <https://pypi.python.org/pypi/xlrd/0.9.2>`_ and :func:`numpy.loadtxt`,
which are used to read in data sources.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals)
from io import StringIO
from simkit.core import UREG, Q_
from simkit.core.exceptions import (
UnnamedDataError, MixedTextNoMatchError
)
from xlrd import open_workbook
import csv
import numpy as np
import json
import os
import time
import re
# Regex pattern matching one number in any of the scanf-style formats %e, %E,
# %f and %g, see:
# http://docs.python.org/2/library/re.html#simulating-scanf
# The whole number is the only capturing group; the inner groups use (?:...)
# so that they are non-capturing.
EFG_PATTERN = '([-+]?(?:\\d+(?:\\.\\d*)?|\\.\\d+)(?:[eE][-+]?\\d+)?)'
# whitelist of :mod:`re` function names that may be looked up by name
RE_METH = ['search', 'match', 'findall', 'split']
class DataReader(object):
    """
    Required interface for all SimKit data readers.

    :param parameters: parameters to be read
    :type parameters: dict
    :param meta: optional meta information stored on the reader [None]
    """
    #: True if reader accepts ``filename`` argument
    is_file_reader = True  # overload in subclasses

    def __init__(self, parameters, meta=None):
        #: parameters to be read by reader
        self.parameters = parameters
        #: meta if any
        self.meta = meta

    def load_data(self, *args, **kwargs):
        """
        Load data from source using reader. This method must be implemented
        by each data reader.

        :param args: positional arguments
        :param kwargs: keyword arguments
        :returns: data read by :class:`DataReader`
        :rtype: dict
        :raises: :exc:`~exceptions.NotImplementedError`
        """
        raise NotImplementedError('load_data')

    def apply_units_to_cache(self, data):
        """
        Apply units to cached data. This method must be implemented by each
        data reader.

        :param data: cached data
        :return: data with units applied
        :rtype: :class:`~pint.unit.Quantity`
        :raises: :exc:`~exceptions.NotImplementedError`
        """
        raise NotImplementedError('apply_units_to_cache')
class JSONReader(DataReader):
    """
    Read data from a JSON file.

    :param parameters: parameters to read
    :type parameters: dict
    :param meta: meta information; if it has a ``data_reader`` attribute,
        that is used as the original :class:`DataReader` class of data that
        was cached as JSON [None]

    This is the default data reader if not specified in the data source. The
    format of the data is similar to the dictionary used to create the data
    registry, except without units.

    For example::

        {
            "data": {
                "DNI": [834, 523, 334, 34, 0, 0],
                "zenith": [21, 28, 45, 79, 90, 90]
            },
            "param_file": "path/to/corresponding/param_file.json",
            "data_source": "MyDataSource"
        }

    Parameters can be specified in a JSON file. ::

        {
            "DNI": {
                "description": "direct normal insolation",
                "units": "W/m**2",
                "isconstant": false
            },
            "zenith": {
                "description": "solar zenith",
                "units": "degrees",
                "isconstant": false
            }
        }

    Parameters can also be specified in the data source as class attributes.
    ::

        class MyDataSrc(DataSource):
            data_reader = JSONReader
            DNI = {
                "description": "direct normal insolation",
                "units": "W/m**2",
                "isconstant": False
            }
            zenith = {
                "description": "solar zenith",
                "units": "degrees",
                "isconstant": False
            }
    """
    def __init__(self, parameters, meta=None):
        super(JSONReader, self).__init__(parameters, meta)
        #: original data reader [None]
        # ``meta`` defaults to None, so look the attribute up defensively
        self.orig_data_reader = getattr(meta, 'data_reader', None)

    def load_data(self, filename, *args, **kwargs):
        """
        Load JSON data.

        If the JSON file is a cache written for another reader and it records
        a ``utc_mod_time``, that time is compared with the modification time
        of the original file (the JSON filename without ``.json``). When the
        original file is newer, the JSON file is deleted and the original
        file is re-read with the original reader.

        :param filename: name of JSON file with data
        :type filename: str
        :return: data
        :rtype: dict
        """
        # append .json extension if needed
        if not filename.endswith('.json'):
            filename += '.json'
        # open file and load JSON data
        with open(filename, 'r') as fid:
            json_data = json.load(fid)
        # if JSONReader is the original reader then apply units and return
        if (not self.orig_data_reader or
                isinstance(self, self.orig_data_reader)):
            return self.apply_units_to_cache(json_data['data'])
        # last modification since JSON file was saved
        utc_mod_time = json_data.get('utc_mod_time')
        # instance of original data reader with original parameters
        orig_data_reader_obj = self.orig_data_reader(self.parameters,
                                                     self.meta)
        # check if file has been modified since saved as JSON file
        if utc_mod_time:
            # convert to ordered tuple so it compares with time.gmtime()
            utc_mod_time = time.struct_time(utc_mod_time)
            orig_filename = filename[:-5]  # strip the ".json" suffix
            # use original file if it's been modified since JSON file saved
            if utc_mod_time < time.gmtime(os.path.getmtime(orig_filename)):
                os.remove(filename)  # delete stale JSON file
                return orig_data_reader_obj.load_data(orig_filename)
        # use JSON file if original file hasn't been modified
        return orig_data_reader_obj.apply_units_to_cache(json_data['data'])

    def apply_units_to_cache(self, data):
        """
        Apply units to data read using :class:`JSONReader`.

        Every parameter that has a "units" key is wrapped in a quantity; the
        other entries of ``data`` are left untouched.

        :param data: cached data
        :type data: dict
        :return: the same dictionary with units applied
        :rtype: dict
        """
        # ``items`` works on both Python 2 and 3, ``iteritems`` only on 2
        for k, val in self.parameters.items():
            if 'units' in val:
                data[k] = Q_(data[k], val.get('units'))
        return data
class XLRDReader(DataReader):
    """
    Read data using XLRD.

    The :attr:`~DataReader.parameters` argument is a dictionary. Each item is
    a dictionary that contains the following keys: "description", "units",
    "range" and "sheet". The parameters argument should be the parameters
    argument created by a :class:`~simkit.core.data_sources.DataSource` and
    the values of each item should be a
    :class:`~simkit.core.data_sources.DataParameter`. Therefore any
    non-parameter attributes like "range" and "sheet" would be in "extras".

    If the range is a ...

    * single cell -- use [rowx, colx].
    * 2-D range -- use 2 arrays, [start, stop], each with [rowx, colx].
    * column slice -- use an array and an int, [slice, colx], in which slice
      is [start-rowx, stop-rowx]. Set stop-rowx to ``None`` to read the rest
      of the column after start-rowx.
    * row slice -- use [rowx, slice] in which slice is [start-colx,
      stop-colx]. Set stop-colx to ``None`` to read the rest of the row after
      start-colx.
    * column -- use [None, colx] or [[], colx]
    * row -- use [rowx, None] or [rowx, []]

    .. seealso::
        `The xlrd Module <https://secure.simplistix.co.uk/svn/xlrd/tags/ \
        0.7.3/xlrd/doc/xlrd.html>`_

    Example of :attr:`~DataReader.parameters`::

        parameters = {
            "month": {
                "description": "month of year",
                "units": "month",
                "range": [[2, 8762], 2],
                "sheet": "Level 1 Outputs"
            },
            "day": {
                "description": "day of month",
                "units": "day",
                "range": [[2, 8762], 3],
                "sheet": "Level 1 Outputs"
            },
            "PAC": {
                "description": "AC power",
                "units": "kW",
                "range": [[2, 8762], 12],
                "sheet": "Level 2 Outputs"
            },
            "PDC": {
                "description": "DC power",
                "units": "kW",
                "range": [[2, 8762], 13],
                "sheet": "Level 2 Outputs"
            }
        }

    This loads "month" and "day" data from columns 2 and 3 in the "Level 1
    Outputs" sheet and "PAC" and "PDC" data from columns 12 and 13 in the
    "Level 2 Outputs" sheets. The units for each data set and a description
    is also given. Each of the data columns is 8760 rows long, from row 2 to
    row 8762. Don't forget that indexing starts at 0, so row 2 is the 3rd
    row.
    """

    def load_data(self, filename, *args, **kwargs):
        """
        Load parameters from Excel spreadsheet.

        Numeric data is returned as an array multiplied by the parameter's
        units. Data that cannot be coerced to float is returned as-is if it
        is all strings, or as ``None`` if it is empty.

        :param filename: Name of Excel workbook with data.
        :type filename: str
        :returns: Data read from Excel workbook.
        :rtype: dict
        :raises: :exc:`~exceptions.ValueError` if data is neither all real
            nor all strings
        """
        # ``basestring`` only exists on Python 2
        try:
            string_types = basestring
        except NameError:
            string_types = str
        # workbook read from file
        workbook = open_workbook(filename, verbosity=True)
        data = {}  # an empty dictionary to store data
        # iterate through the parameters
        for param, pval in self.parameters.items():
            extras = pval['extras']
            # get the parameter's worksheet from the workbook
            worksheet = workbook.sheet_by_name(extras['sheet'])
            # split the parameter's range elements
            prng0, prng1 = extras['range']
            # missing "units", json ``null`` and Python ``None`` all OK!
            # convert to str from unicode, None to '' (dimensionless)
            punits = str(pval.get('units') or '')
            # replace None with empty list, i.e. "read the whole row/column"
            if prng0 is None:
                prng0 = []
            if prng1 is None:
                prng1 = []
            # FIXME: Use duck-typing here instead of type-checking!
            # if both elements in range are `int` then parameter is a cell
            if isinstance(prng0, int) and isinstance(prng1, int):
                datum = worksheet.cell_value(prng0, prng1)
            # if either element is a `list` then parameter is a slice
            elif isinstance(prng0, list) and isinstance(prng1, int):
                datum = worksheet.col_values(prng1, *prng0)
            elif isinstance(prng0, int) and isinstance(prng1, list):
                datum = worksheet.row_values(prng0, *prng1)
            # if both elements are `list` then parameter is 2-D:
            # prng0 is [start-rowx, start-colx], prng1 is [stop-rowx,
            # stop-colx], read column by column
            else:
                datum = [worksheet.col_values(col, prng0[0], prng1[0])
                         for col in range(prng0[1], prng1[1])]
            # duck typing that datum is real; only one statement in the
            # ``try`` so that no unrelated error is caught by accident
            try:
                # builtin ``float``: the ``np.float`` alias was removed
                npdatum = np.array(datum, dtype=float)
            except ValueError:
                # data must be real or *all* strings!
                # empty string, None or JSON null also OK
                # NB: all([]) == True but any([]) == False
                if not datum:
                    data[param] = None  # convert empty to None
                elif all(isinstance(_, string_types) for _ in datum):
                    data[param] = datum  # all str is OK (EG all 'TMY')
                elif all(not _ for _ in datum):
                    data[param] = None  # convert list of empty to None
                else:
                    raise  # re-raise ValueError if not all real or str
            else:
                data[param] = npdatum * UREG(punits)
        return data

    def apply_units_to_cache(self, data):
        """
        Apply units to cached data read using :class:`JSONReader`.

        Entries whose in-place multiplication by the units raises
        :exc:`~exceptions.TypeError` are left unchanged.

        :param data: Cached data.
        :type data: dict
        :return: data with units
        :rtype: dict
        """
        # iterate through the parameters
        for param, pval in self.parameters.items():
            # try to apply units
            try:
                data[param] *= UREG(str(pval.get('units') or ''))
            except TypeError:
                continue
        return data
class NumPyLoadTxtReader(DataReader):
    """
    Read data using :func:`numpy.loadtxt` function.

    The :attr:`~DataReader.parameters` argument is a dictionary that must
    have a "data" key. An additional "header" is optional; see
    :func:`_read_header`.

    The "data" key provides arguments to :func:`numpy.loadtxt`. The "dtype"
    key must be specified, as names are required for all data in SimKit. Some
    of the other :func:`numpy.loadtxt` arguments: "delimiter" and "skiprows"
    can also be specified as keys. In addition "units" can also be specified
    in a dictionary in which the keys are the names of the data output by
    :func:`numpy.loadtxt`. Converters are not permitted. The "usecols"
    argument is also not used since :func:`numpy.loadtxt` states that "the
    number of columns used must match the number of fields in the data-type"
    and "dtype" is already specified. The other arguments, "fname",
    "comments", "unpack" and "ndmin" are also not used.

    Example of :attr:`~DataReader.parameters`::

        parameters = {
            'header': {
                'delimiter': ',',
                'fields': [
                    ['Name', 'str'],
                    ['Latitude', 'float', 'arcdegree'],
                    ['Longitude', 'float', 'arcdegree']]},
            'data': {
                'dtype': [
                    ['Date', '(3,)int'], ['Time', '(2,)int'],
                    ['GHI', 'float'], ['DNI', 'float'], ['DHI', 'float']],
                'units': {
                    'GHI': 'W/m**2', 'DNI': 'W/m**2', 'DHI': 'W/m**2'},
                'usecols': [0, 1, 4, 7, 10]}}

    This loads a header with 3 fields followed by 5 columns of data,
    converting the 1st column, "Date", to a 3-element tuple of ``int`` and the
    2nd column, "Time", to a 2-element tuple of ``int``. Note that the
    "usecols" key in the example is not read by :meth:`load_data`.
    """

    def load_data(self, filename, *args, **kwargs):
        """
        Load data from text file.

        If a "header" is specified, it is parsed from the first line and the
        file is then rewound, so "skiprows" must account for the header line.

        :param filename: name of text file to read
        :type filename: str
        :returns: data read from file using :func:`numpy.loadtxt`
        :rtype: dict
        :raises: :exc:`~exceptions.KeyError` if "data" or "dtype" is missing
        """
        # header keys
        header_param = self.parameters.get('header')  # default is None
        # data keys
        data_param = self.parameters['data']  # raises KeyError if no 'data'
        dtype = data_param['dtype']  # raises KeyError if no 'dtype'
        # convert in place to list of tuples of normal ASCII str
        if dtype:
            _utf8_list_to_ascii_tuple(dtype)
        delimiter = data_param.get('delimiter')  # default is None
        # numpy.loadtxt needs an integer, so missing or null means 0
        skiprows = data_param.get('skiprows') or 0
        # missing or null units both mean "no units"
        data_units = data_param.get('units') or {}
        data = {}  # a dictionary for data
        # open file for reading
        with open(filename, 'r') as fid:
            # read header
            if header_param:
                data.update(_read_header(fid, header_param))
                fid.seek(0)  # move cursor back to beginning
            # read data
            data_data = np.loadtxt(fid, dtype, delimiter=delimiter,
                                   skiprows=skiprows)
        # apply units
        data.update(_apply_units(data_data, data_units, fid.name))
        return data

    def apply_units_to_cache(self, data):
        """
        Apply units to data originally loaded by :class:`NumPyLoadTxtReader`.

        :param data: cached data
        :type data: dict
        :return: data with units applied
        """
        return _apply_units_to_numpy_data_readers(self.parameters, data)
class NumPyGenFromTxtReader(DataReader):
    """
    Read data using :func:`numpy.genfromtxt` function.

    The :attr:`~DataReader.parameters` argument is a dictionary that must
    have a "data" key. An additional "header" is optional; see
    :func:`_read_header`.

    The "data" key provides arguments to :func:`numpy.genfromtxt`. Either the
    "dtype" or "names" key must be specified, as names are required for all
    data in SimKit. Some of the other :func:`numpy.genfromtxt` arguments:
    "delimiter", "skip_header", "usecols", "excludelist" and "deletechars"
    can also be specified as keys. In addition "units" can also be specified
    in a dictionary in which the keys are the names of the data output by
    :func:`numpy.genfromtxt`. Converters are not permitted. The other
    arguments, "fname", "comments", "skip_footer", "missing_values",
    "filling_values", "defaultfmt", "autostrip", "replace_space",
    "case_sensitive", "unpack", "usemask" and "invalid_raise" are also not
    used.

    If the data names are not specified in the "dtype" key or "names" key,
    then :meth:`~NumPyGenFromTxtReader.load_data` will raise an exception,
    :exc:`~simkit.core.exceptions.UnnamedDataError`.

    .. seealso::
        `Importing data with genfromtxt \
        <http://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_

    Example of :attr:`~DataReader.parameters`::

        parameters = {
            'header': {
                'delimiter': ' ',
                'fields': [
                    ['city', 'str'], ['state', 'str'],
                    ["timezone", 'int'], ["elevation", 'int', 'meters']]},
            'data': {
                'delimiter': 4,
                'names': ['DNI', 'DHI', 'GHI'],
                'units': {'DNI': 'W/m**2', 'DHI': 'W/m**2', 'GHI': 'W/m**2'}}}

    This loads a header that is delimited by whitespace, followed by data in
    three fixed-width columns all 4-digit floats.
    """

    def load_data(self, filename, *args, **kwargs):
        """
        Load data from text file.

        If a "header" is specified, it is parsed from the first line and the
        file is then rewound, so "skip_header" must account for the header
        line.

        :param filename: name of file to read
        :type filename: str
        :returns: data read from file using :func:`numpy.genfromtxt`
        :rtype: dict
        :raises: :exc:`~simkit.core.exceptions.UnnamedDataError`
        """
        # header keys
        header_param = self.parameters.get('header')  # default is None
        # data keys
        data_param = self.parameters['data']  # raises KeyError if no 'data'
        dtype = data_param.get('dtype')  # default is None
        # if not None convert in place to tuples of normal ASCII str
        if dtype:
            _utf8_list_to_ascii_tuple(dtype)
        delimiter = data_param.get('delimiter')  # default is None
        # numpy.genfromtxt needs an integer, so missing or null means 0
        skip_header = data_param.get('skip_header') or 0
        usecols = data_param.get('usecols')  # default is None
        names = data_param.get('names')  # default is None
        names = [str(_) for _ in names] if names else None  # -> str
        excludelist = data_param.get('excludelist')  # default is None
        deletechars = data_param.get('deletechars')  # default is None
        # missing or null units both mean "no units"
        data_units = data_param.get('units') or {}
        # either dtype or names must be specified
        if not (dtype or names):
            raise UnnamedDataError(filename)
        data = {}  # a dictionary for data
        # open file for reading
        with open(filename, 'r') as fid:
            # read header
            if header_param:
                data.update(_read_header(fid, header_param))
                fid.seek(0)  # move cursor back to beginning
            # data
            data_data = np.genfromtxt(fid, dtype, delimiter=delimiter,
                                      skip_header=skip_header,
                                      usecols=usecols, names=names,
                                      excludelist=excludelist,
                                      deletechars=deletechars)
        # apply units
        data.update(_apply_units(data_data, data_units, fid.name))
        return data

    def apply_units_to_cache(self, data):
        """
        Apply units to data originally loaded by
        :class:`NumPyGenFromTxtReader`.

        :param data: cached data
        :type data: dict
        :return: data with units applied
        """
        return _apply_units_to_numpy_data_readers(self.parameters, data)
def _apply_units_to_numpy_data_readers(parameters, data):
    """
    Apply units to data originally loaded by :class:`NumPyLoadTxtReader` or
    :class:`NumPyGenFromTxtReader`.

    Header fields are ``[name, type[, units]]`` lists; units are applied only
    to fields that have the 3rd element. Data units come from the optional
    "units" dictionary under the "data" key. Units are applied in place with
    ``*=``.

    :param parameters: Dictionary of data source parameters read from JSON
        file.
    :type parameters: dict
    :param data: Dictionary of data read
    :type data: dict
    :returns: the same ``data`` dictionary with units applied
    :rtype: dict
    """
    # apply header units
    header_param = parameters.get('header')  # default is None
    # check for headers
    if header_param:
        fields = header_param['fields']  # header fields
        # dictionary of header field parameters: name -> [type[, units]]
        header_fields = {field[0]: field[1:] for field in fields}
        # ``items`` works on both Python 2 and 3, ``iteritems`` only on 2
        for k, val in header_fields.items():
            # check for units in header field parameters
            if len(val) > 1:
                data[k] *= UREG(str(val[1]))  # apply units
    # apply other data units
    data_units = parameters['data'].get('units')  # default is None
    if data_units:
        for k, val in data_units.items():
            data[k] *= UREG(str(val))  # apply units
    return data
def _header_bool(value):
    """Cast header text to ``bool``; "false", "0", "no", ... are ``False``."""
    if value is None:
        return False
    return str(value).strip().lower() not in (
        '', '0', 'false', 'no', 'off', 'none')


def _read_header(f, header_param):
    """
    Read and parse data from 1st line of a file.

    :param f: :func:`file` or :class:`~StringIO.StringIO` object from which to
        read 1st line.
    :type f: file
    :param header_param: Parameters used to parse the data from the header.
        Contains "delimiter" and "fields".
    :type header_param: dict
    :returns: Dictionary of data read from header.
    :rtype: dict
    :raises: :exc:`~simkit.core.exceptions.UnnamedDataError` if "fields" is
        missing, :exc:`~exceptions.TypeError` if a field type is not one of
        the supported names

    The **header_param** argument contains keys to read the 1st line of **f**.
    If "delimiter" is ``None`` or missing, the default delimiter is a comma,
    otherwise it is converted to ``str`` and handed to :mod:`csv`, which
    accepts a single character.

    NOTE(review): earlier documentation also listed integer and
    sequence-of-``int`` (fixed width) delimiters, but no fixed-width parsing
    is implemented here -- confirm before relying on it.

    The "fields" key is a list of (parameter-name, parameter-type[, parameter-
    units]) lists. Supported type names start with "int", "long", "float",
    "str" or "bool" (case-insensitive). Columns in the header line beyond the
    named fields are ignored.
    """
    # default delimiter is a comma; an explicit None (JSON null) is treated
    # the same as a missing key instead of becoming the string 'None'
    header_delim = header_param.get('delimiter')
    header_delim = str(',' if header_delim is None else header_delim)
    # don't allow unnamed fields
    if 'fields' not in header_param:
        raise UnnamedDataError(getattr(f, 'name', None))
    header_fields = {field[0]: field[1:] for field in header_param['fields']}
    # header_names must be a list in the same order as 'fields' so that it
    # matches the order of the data read by csv
    header_names = [field[0] for field in header_param['fields']]
    # ``basestring`` and ``long`` only exist on Python 2
    try:
        string_types, long_type = basestring, long
    except NameError:
        string_types, long_type = str, int
    # whitelist of header types: (type-name prefix, cast function)
    casters = (('int', int), ('long', long_type), ('float', float),
               ('str', str), ('bool', _header_bool))
    # use csv because it will preserve quoted fields with commas; csv accepts
    # any iterable of lines, so pass the 1st line in a list
    header_reader = csv.DictReader([f.readline()], header_names,
                                   delimiter=header_delim,
                                   skipinitialspace=True)
    row = next(header_reader)  # parse the header dictionary
    data = {}
    # iterate over the named fields only, extra columns are ignored
    for name in header_names:
        spec = header_fields[name]  # [type[, units]]
        header_type = spec[0]  # spec'd type
        if isinstance(header_type, string_types):
            type_name = header_type.lower()
            for prefix, caster in casters:
                if type_name.startswith(prefix):
                    break
            else:
                raise TypeError('"%s" is not a supported type.' % header_type)
            header_type = caster
        # a non-string ``header_type`` is called as given
        data[name] = header_type(row[name])  # cast value to type
        # check for units in 3rd element of field
        if len(spec) > 1:
            units = UREG(str(spec[1]))  # spec'd units
            data[name] = data[name] * units  # apply units
    return data
def _apply_units(data_data, data_units, fname):
    """
    Apply units to data.

    Fields with units are multiplied by their units. Fields without units are
    returned as lists if they hold text (byte or unicode strings), otherwise
    as the NumPy field itself.

    :param data_data: NumPy structured array with data from fname.
    :type data_data: :class:`numpy.ndarray`
    :param data_units: Units of fields in data_data; ``None`` is treated as
        no units.
    :type data_units: dict
    :param fname: Name of file from which data_data was read.
    :type fname: str
    :returns: Dictionary of data with units applied.
    :rtype: dict
    :raises: :exc:`~simkit.core.exceptions.UnnamedDataError`
    """
    data_names = data_data.dtype.names
    # raise error if NumPy data doesn't have names
    if not data_names:
        raise UnnamedDataError(fname)
    data_units = data_units or {}
    data = {}  # dictionary of data read by NumPy
    # iterate over data read by NumPy
    for data_name in data_names:
        column = data_data[data_name]
        if data_name in data_units:
            # if units specified in parameters, then convert to string
            units = str(data_units[data_name])
            data[data_name] = column * UREG(units)
        elif column.dtype.kind in ('S', 'U'):
            # no units and text: dtype kind "S" is byte strings, "U" is
            # unicode, so this works the same on Python 2 and 3
            data[data_name] = column.tolist()
        else:
            data[data_name] = column
    return data
def _utf8_list_to_ascii_tuple(utf8_list):
    """
    Convert unicode strings in a list of lists to ascii in a list of tuples.

    The list is modified in place: each item is replaced by a tuple whose
    first two elements are converted with :func:`str`; any further elements
    are kept as they are. Items that are already tuples are handled too, so
    the function can safely be applied to the same list more than once.

    :param utf8_list: A nested list of unicode strings.
    :type utf8_list: list
    """
    for n, utf8 in enumerate(utf8_list):
        # build a new tuple instead of assigning to ``utf8[0]``/``utf8[1]``,
        # which would fail if the item is already a tuple
        utf8_list[n] = (str(utf8[0]), str(utf8[1])) + tuple(utf8[2:])
class ParameterizedXLS(XLRDReader):
    """
    Concatenate data from parameterized sheets.

    :param parameters: Parameterization information. Must have a "parameter"
        key with "name", "values", "units" and "sheets", and a "data" key
        with the data to read from each of the sheets.
    :param meta: optional meta information passed on to the parent [None]

    All data in parameterized sheets must be vectors of only numbers.

    NOTE(review): the reader parameters built here are nested by sheet name,
    ``{sheet: {name_n: value}}``, while :meth:`XLRDReader.load_data` reads
    ``pval['extras']['sheet']`` from a flat ``{name: pval}`` dictionary --
    confirm which layout is intended.
    """
    def __init__(self, parameters, meta=None):
        #: parameterization information
        self.parameterization = parameters
        new_parameters = {}  # empty dict for sheet parameters
        parameter_sheets = self.parameterization['parameter']['sheets']
        for n, sheet in enumerate(parameter_sheets):
            # data names get the sheet index as suffix, e.g. "key_0"
            new_parameters[sheet] = {
                k + '_' + str(n): v
                for k, v in self.parameterization['data'].items()
            }
        super(ParameterizedXLS, self).__init__(new_parameters, meta)

    def load_data(self, filename, *args, **kwargs):
        """
        Load parameterized data from different sheets.

        The data read from each sheet is reshaped to a row and the rows of all
        sheets are concatenated under the original data name.

        :param filename: Name of Excel workbook with data.
        :type filename: str
        :returns: parameter values and concatenated data
        :rtype: dict
        """
        # load parameterized data
        data = super(ParameterizedXLS, self).load_data(filename)
        # add parameter to data
        parameter = self.parameterization['parameter']
        parameter_name = parameter['name']
        parameter_values = parameter['values']
        parameter_units = str(parameter['units'])
        data[parameter_name] = parameter_values * UREG(parameter_units)
        # number of sheets
        num_sheets = len(parameter['sheets'])
        # parse and concatenate parameterized data
        for key, val in self.parameterization['data'].items():
            # missing units or None -> '' (dimensionless); the ``or`` must be
            # inside ``str()``, since ``str(None)`` is the truthy 'None'
            units = str(val.get('units') or '')
            datalist = []
            for n in range(num_sheets):
                k = key + '_' + str(n)
                # pop removes the per-sheet key, which is no longer needed
                datalist.append(data.pop(k).reshape((1, -1)))
            # NOTE(review): XLRDReader.load_data already multiplies each
            # value by its units -- confirm that applying units again to the
            # concatenated array is intended
            data[key] = np.concatenate(datalist, axis=0) * UREG(units)
        return data

    def apply_units_to_cache(self, data):
        """
        Apply units to :class:`ParameterizedXLS` data reader.

        Units are applied in place to the parameter and to each item in the
        "data" section of the parameterization. Data for which the
        multiplication raises :exc:`~exceptions.TypeError` is left unchanged.

        :param data: Cached data.
        :type data: dict
        :return: data with units
        :rtype: dict
        """
        # the parameterization is kept in ``self.parameterization``;
        # ``self.parameters`` holds the per-sheet parameters from __init__
        # and has no "parameter" key
        parameter = self.parameterization['parameter']
        data[parameter['name']] *= UREG(str(parameter['units']))
        # data
        for key, val in self.parameterization['data'].items():
            try:
                data[key] *= UREG(str(val.get('units') or ''))
            except TypeError:
                continue
        return data
class MixedTextXLS(XLRDReader):
    r"""
    Get parameters from cells mixed with text by matching regex pattern.

    :raises: :exc:`~simkit.core.exceptions.MixedTextNoMatchError`

    Use this reader for spreadsheets that have numerical data mixed with text.
    It uses the same parameter file as :class:`XLRDReader` with two additional
    keys: "pattern" and "method". The "pattern" must be a valid regex pattern.
    Remember to escape backslashes. The "method" must be one of the following
    regex methods from :mod:`re`:

    * :func:`~re.match`
    * :func:`~re.search`
    * :func:`~re.split`
    * :func:`~re.findall`

    The default method is :func:`re.search` and the default pattern searches
    for any number represented by the scanf-style formatters "%e", "%E", "%f"
    or "%g". This will find one number in any of the formats anywhere in the
    text of the cell(s) read.

    Example::

        {
            "Sheet1": {
                "sigma_bypass_diode": {
                    "range": [15, 1],
                    "pattern":
            "\\w+ = ([-+]?(?:\\d+(?:\\.\\d*)?|\\.\\d+)(?:[eE][-+]?\\d+)?)",
                    "method": "match"
                },
                "B_bypass_diode": {
                    "range": [16, 1],
                    "method": "findall"
                },
                "C_bypass_diode": {
                    "range": [17, 1],
                    "pattern": "\\((\\d+), (\\d+), (\\d+)\\)",
                    "method": "search"
                },
                "cov_bypass_diode": {
                    "range": [18, 1],
                    "pattern": "[,;]",
                    "method": "split"
                }
            }
        }

    These examples all read from "Sheet1". The first example matches one or
    more alphanumeric characters at the beginning of the string set equal to
    an integer, decimal or number in scientific notation, such as
    "Std = 0.4985" from cell B16. The second example finds all numbers
    matching the default pattern in cell B17. The third example searches for
    3 integers in parenthesis separated by commas anywhere in cell B18. The
    last example splits a string delimited by commas and semicolons in cell
    B19.

    If no match is found then
    :exc:`~simkit.core.exceptions.MixedTextNoMatchError`
    is raised. Only numbers can be read, and any single-dimensions will be
    squeezed out. For example scalars will become 0-d arrays.

    NOTE(review): :meth:`load_data` iterates the parameters nested by sheet
    as in the example above, while :meth:`XLRDReader.load_data` reads
    ``pval['extras']['sheet']`` from a flat ``{name: pval}`` dictionary --
    confirm which layout is intended.
    """

    def load_data(self, filename, *args, **kwargs):
        """
        Load text data from different sheets.

        :param filename: Name of Excel workbook with data.
        :type filename: str
        :returns: numbers parsed from the text, with units applied
        :rtype: dict
        :raises: :exc:`~exceptions.AttributeError` if "method" is not one of
            the allowed regex methods,
            :exc:`~simkit.core.exceptions.MixedTextNoMatchError` if the
            pattern does not match
        """
        # load text data
        data = super(MixedTextXLS, self).load_data(filename)
        # iterate through sheets in parameters
        for sheet_params in self.parameters.values():
            # iterate through the parameters on each sheet
            for param, pval in sheet_params.items():
                pattern = pval.get('pattern', EFG_PATTERN)  # get pattern
                method = pval.get('method', 'search')  # get re method name
                # whitelist re methods, getattr could be considered harmful
                if method not in RE_METH:
                    allowed = ', '.join('"%s"' % _ for _ in RE_METH)
                    raise AttributeError(
                        'Only %s regex methods are allowed.' % allowed)
                re_meth = getattr(re, method)
                match = re_meth(pattern, data[param])  # get matches
                if not match:
                    raise MixedTextNoMatchError(re_meth, pattern, data[param])
                if hasattr(match, 'groups'):
                    # "search" and "match" return a single match object
                    match = match.groups()
                else:
                    # "findall" and "split" return lists of str (or tuples
                    # of str), which are used as they are; match objects in
                    # a list are replaced by their groups
                    match = [m.groups() if hasattr(m, 'groups') else m
                             for m in match]
                npdata = np.array(match, dtype=float).squeeze()
                data[param] = npdata * UREG(str(pval.get('units') or ''))
        return data