28 changes: 22 additions & 6 deletions examples/benchmarks/LightGBM/multi_freq_handler.py
@@ -4,15 +4,21 @@
 import pandas as pd
 
 from qlib.data.dataset.loader import QlibDataLoader
-from qlib.contrib.data.handler import DataHandlerLP, _DEFAULT_LEARN_PROCESSORS, check_transform_proc
+from qlib.contrib.data.handler import (
+    DataHandlerLP,
+    _DEFAULT_LEARN_PROCESSORS,
+    check_transform_proc,
+)
 
 
 class Avg15minLoader(QlibDataLoader):
     def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame:
         df = super(Avg15minLoader, self).load(instruments, start_time, end_time)
         if self.is_group:
             # feature_day (day freq) and feature_15min (1min freq, averaged every 15 minutes) are renamed to "feature"
-            df.columns = df.columns.map(lambda x: ("feature", x[1]) if x[0].startswith("feature") else x)
+            df.columns = df.columns.map(
+                lambda x: ("feature", x[1]) if x[0].startswith("feature") else x
+            )
         return df
 
 
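As a quick illustration (not part of the change itself), here is a minimal sketch of what the renaming lambda above does to a toy two-level column index; the group and field names are assumed for the example:

import pandas as pd

# Toy frame with a two-level column index; group names assumed for illustration.
cols = pd.MultiIndex.from_tuples(
    [("feature_day", "CLOSE_DAY"), ("feature_15min", "CLOSE1"), ("label", "LABEL0")]
)
df = pd.DataFrame([[1.0, 2.0, 0.1]], columns=cols)

# Any group whose name starts with "feature" is merged into a single
# "feature" group; other groups (here "label") are left untouched.
df.columns = df.columns.map(
    lambda x: ("feature", x[1]) if x[0].startswith("feature") else x
)
print(df.columns.tolist())
# [('feature', 'CLOSE_DAY'), ('feature', 'CLOSE1'), ('label', 'LABEL0')]
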
@@ -32,10 +38,17 @@ def __init__(
         inst_processors=None,
         **kwargs,
     ):
-        infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
-        learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
+        infer_processors = check_transform_proc(
+            infer_processors, fit_start_time, fit_end_time
+        )
+        learn_processors = check_transform_proc(
+            learn_processors, fit_start_time, fit_end_time
+        )
         data_loader = Avg15minLoader(
-            config=self.loader_config(), filter_pipe=filter_pipe, freq=freq, inst_processors=inst_processors
+            config=self.loader_config(),
+            filter_pipe=filter_pipe,
+            freq=freq,
+            inst_processors=inst_processors,
         )
         super().__init__(
             instruments=instruments,
@@ -123,7 +136,10 @@ def loader_config(self):
         tmp_names = []
         for i, _f in enumerate(fields):
             _fields = [f"Ref(Mean({_f}, 15), {j * 15})" for j in range(1, 240 // 15)]
-            _names = [f"{names[i][:-1]}{int(names[i][-1])+j}" for j in range(240 // 15 - 1, 0, -1)]
+            _names = [
+                f"{names[i][:-1]}{int(names[i][-1])+j}"
+                for j in range(240 // 15 - 1, 0, -1)
+            ]
             _fields.append(f"Mean({_f}, 15)")
             _names.append(f"{names[i][:-1]}{int(names[i][-1])+240 // 15}")
             tmp_fields += _fields
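A short worked example (illustrative only; the base field "$close" and name "CLOSE0" are assumed) of the expressions this loop generates for one field over a 240-minute session:

# Illustrative only: expand one hypothetical base field "$close" named "CLOSE0"
# into 16 fifteen-minute averages, as the loader_config loop above does.
_f, name = "$close", "CLOSE0"

fields = [f"Ref(Mean({_f}, 15), {j * 15})" for j in range(1, 240 // 15)]
names = [f"{name[:-1]}{int(name[-1]) + j}" for j in range(240 // 15 - 1, 0, -1)]
fields.append(f"Mean({_f}, 15)")
names.append(f"{name[:-1]}{int(name[-1]) + 240 // 15}")

print(fields[0], "->", names[0])    # Ref(Mean($close, 15), 15) -> CLOSE15
print(fields[-2], "->", names[-2])  # Ref(Mean($close, 15), 225) -> CLOSE1
print(fields[-1], "->", names[-1])  # Mean($close, 15) -> CLOSE16

So CLOSE16 is the most recent 15-minute average and CLOSE1 the oldest, with the numbering increasing toward the present.
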
49 changes: 38 additions & 11 deletions examples/benchmarks/TFT/data_formatters/base.py
@@ -142,7 +142,11 @@ def _check_single_column(input_type):
             length = len([tup for tup in column_definition if tup[2] == input_type])
 
             if length != 1:
-                raise ValueError("Illegal number of inputs ({}) of type {}".format(length, input_type))
+                raise ValueError(
+                    "Illegal number of inputs ({}) of type {}".format(
+                        length, input_type
+                    )
+                )
 
         _check_single_column(InputTypes.ID)
         _check_single_column(InputTypes.TIME)
@@ -152,45 +156,66 @@ def _check_single_column(input_type):
         real_inputs = [
             tup
             for tup in column_definition
-            if tup[1] == DataTypes.REAL_VALUED and tup[2] not in {InputTypes.ID, InputTypes.TIME}
+            if tup[1] == DataTypes.REAL_VALUED
+            and tup[2] not in {InputTypes.ID, InputTypes.TIME}
         ]
         categorical_inputs = [
             tup
             for tup in column_definition
-            if tup[1] == DataTypes.CATEGORICAL and tup[2] not in {InputTypes.ID, InputTypes.TIME}
+            if tup[1] == DataTypes.CATEGORICAL
+            and tup[2] not in {InputTypes.ID, InputTypes.TIME}
         ]
 
         return identifier + time + real_inputs + categorical_inputs
 
     def _get_input_columns(self):
         """Returns names of all input columns."""
-        return [tup[0] for tup in self.get_column_definition() if tup[2] not in {InputTypes.ID, InputTypes.TIME}]
+        return [
+            tup[0]
+            for tup in self.get_column_definition()
+            if tup[2] not in {InputTypes.ID, InputTypes.TIME}
+        ]
 
     def _get_tft_input_indices(self):
        """Returns the relevant indexes and input sizes required by TFT."""
 
         # Functions
         def _extract_tuples_from_data_type(data_type, defn):
-            return [tup for tup in defn if tup[1] == data_type and tup[2] not in {InputTypes.ID, InputTypes.TIME}]
+            return [
+                tup
+                for tup in defn
+                if tup[1] == data_type
+                and tup[2] not in {InputTypes.ID, InputTypes.TIME}
+            ]
 
         def _get_locations(input_types, defn):
             return [i for i, tup in enumerate(defn) if tup[2] in input_types]
 
         # Start extraction
         column_definition = [
-            tup for tup in self.get_column_definition() if tup[2] not in {InputTypes.ID, InputTypes.TIME}
+            tup
+            for tup in self.get_column_definition()
+            if tup[2] not in {InputTypes.ID, InputTypes.TIME}
         ]
 
-        categorical_inputs = _extract_tuples_from_data_type(DataTypes.CATEGORICAL, column_definition)
-        real_inputs = _extract_tuples_from_data_type(DataTypes.REAL_VALUED, column_definition)
+        categorical_inputs = _extract_tuples_from_data_type(
+            DataTypes.CATEGORICAL, column_definition
+        )
+        real_inputs = _extract_tuples_from_data_type(
+            DataTypes.REAL_VALUED, column_definition
+        )
 
         locations = {
             "input_size": len(self._get_input_columns()),
             "output_size": len(_get_locations({InputTypes.TARGET}, column_definition)),
             "category_counts": self.num_classes_per_cat_input,
             "input_obs_loc": _get_locations({InputTypes.TARGET}, column_definition),
-            "static_input_loc": _get_locations({InputTypes.STATIC_INPUT}, column_definition),
-            "known_regular_inputs": _get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, real_inputs),
+            "static_input_loc": _get_locations(
+                {InputTypes.STATIC_INPUT}, column_definition
+            ),
+            "known_regular_inputs": _get_locations(
+                {InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, real_inputs
+            ),
             "known_categorical_inputs": _get_locations(
                 {InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, categorical_inputs
             ),
@@ -213,7 +238,9 @@ def get_experiment_params(self):
 
         for k in required_keys:
             if k not in fixed_params:
-                raise ValueError("Field {}".format(k) + " missing from fixed parameter definitions!")
+                raise ValueError(
+                    "Field {}".format(k) + " missing from fixed parameter definitions!"
+                )
 
         fixed_params["column_definition"] = self.get_column_definition()
 
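Illustrative sketch (not part of the diff): a toy column definition showing how the _get_locations-style filtering above maps input types to positional indices. The enum classes here are simplified stand-ins for the real DataTypes/InputTypes in data_formatters.base.

import enum

class DataTypes(enum.IntEnum):  # stand-in for data_formatters.base.DataTypes
    REAL_VALUED = 0
    CATEGORICAL = 1

class InputTypes(enum.IntEnum):  # stand-in for data_formatters.base.InputTypes
    TARGET = 0
    KNOWN_INPUT = 1
    STATIC_INPUT = 2
    ID = 3
    TIME = 4

# (column name, data type, input type) tuples, ID/TIME already filtered out
defn = [
    ("LABEL0", DataTypes.REAL_VALUED, InputTypes.TARGET),
    ("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
    ("const", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
]

def _get_locations(input_types, defn):
    return [i for i, tup in enumerate(defn) if tup[2] in input_types]

print(_get_locations({InputTypes.TARGET}, defn))  # [0] -> input_obs_loc
print(_get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, defn))  # [1, 2]
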
12 changes: 9 additions & 3 deletions examples/benchmarks/TFT/data_formatters/qlib_Alpha158.py
@@ -110,8 +110,12 @@ def set_scalers(self, df):
         print("Setting scalers with training data...")
 
         column_definitions = self.get_column_definition()
-        id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
-        target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
+        id_column = utils.get_single_col_by_input_type(
+            InputTypes.ID, column_definitions
+        )
+        target_column = utils.get_single_col_by_input_type(
+            InputTypes.TARGET, column_definitions
+        )
 
         # Extract identifiers in case required
         self.identifiers = list(df[id_column].unique())
@@ -137,7 +141,9 @@ def set_scalers(self, df):
         for col in categorical_inputs:
             # Set all to str so that we don't have mixed integer/string columns
             srs = df[col].apply(str)
-            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
+            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
+                srs.values
+            )
             num_classes.append(srs.nunique())
 
         # Set categorical scaler outputs
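A quick illustration (toy data, not from the diff) of the cast-to-str-then-encode pattern above:

import pandas as pd
import sklearn.preprocessing

# Mixed int/str values are cast to str first so the encoder sees one dtype.
srs = pd.Series([3, "a", 3, "b"]).apply(str)
encoder = sklearn.preprocessing.LabelEncoder().fit(srs.values)

print(list(encoder.classes_))        # ['3', 'a', 'b']
print(encoder.transform(srs.values)) # [0 1 0 2]
print(srs.nunique())                 # 3 -> the value appended to num_classes
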
11 changes: 9 additions & 2 deletions examples/benchmarks/TFT/expt_settings/configs.py
@@ -54,7 +54,9 @@ def __init__(self, experiment="volatility", root_folder=None):
 
         # Defines all relevant paths
         if root_folder is None:
-            root_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "outputs")
+            root_folder = os.path.join(
+                os.path.dirname(os.path.realpath(__file__)), "..", "outputs"
+            )
         print("Using root folder {}".format(root_folder))
 
         self.root_folder = root_folder
@@ -64,7 +66,12 @@ def __init__(self, experiment="volatility", root_folder=None):
         self.results_folder = os.path.join(root_folder, "results", experiment)
 
         # Creates folders if they don't exist
-        for relevant_directory in [self.root_folder, self.data_folder, self.model_folder, self.results_folder]:
+        for relevant_directory in [
+            self.root_folder,
+            self.data_folder,
+            self.model_folder,
+            self.results_folder,
+        ]:
             if not os.path.exists(relevant_directory):
                 os.makedirs(relevant_directory)
 
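An aside, not something this diff changes: the exists-check plus makedirs pair has a small race window if two processes start at once; on Python 3.2+ the same intent can be expressed as below (directory paths assumed for illustration).

import os

# Same intent, no race between the exists() check and the makedirs() call.
for relevant_directory in ["outputs/data", "outputs/saved_models"]:
    os.makedirs(relevant_directory, exist_ok=True)
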
45 changes: 36 additions & 9 deletions examples/benchmarks/TFT/libs/hyperparam_opt.py
@@ -48,7 +48,9 @@ class HyperparamOptManager:
       hyperparam_folder: Where to save optimisation outputs.
     """
 
-    def __init__(self, param_ranges, fixed_params, model_folder, override_w_fixed_params=True):
+    def __init__(
+        self, param_ranges, fixed_params, model_folder, override_w_fixed_params=True
+    ):
         """Instantiates model.
 
         Args:
@@ -136,9 +138,17 @@ def _check_params(self, params):
         missing_fields = [k for k in valid_fields if k not in params]
 
         if invalid_fields:
-            raise ValueError("Invalid Fields Found {} - Valid ones are {}".format(invalid_fields, valid_fields))
+            raise ValueError(
+                "Invalid Fields Found {} - Valid ones are {}".format(
+                    invalid_fields, valid_fields
+                )
+            )
         if missing_fields:
-            raise ValueError("Missing Fields Found {} - Valid ones are {}".format(missing_fields, valid_fields))
+            raise ValueError(
+                "Missing Fields Found {} - Valid ones are {}".format(
+                    missing_fields, valid_fields
+                )
+            )
 
     def _get_name(self, params):
         """Returns a unique key for the supplied set of params."""
@@ -168,7 +178,9 @@ def get_next_parameters(self, ranges_to_skip=None):
         def _get_next():
             """Returns next hyperparameter set per try."""
 
-            parameters = {k: np.random.choice(self.param_ranges[k]) for k in param_range_keys}
+            parameters = {
+                k: np.random.choice(self.param_ranges[k]) for k in param_range_keys
+            }
 
             # Adds fixed params
             for k in self.fixed_params:
@@ -265,7 +277,9 @@ def __init__(
         # Sanity checks
         if worker_number > max_workers:
             raise ValueError(
-                "Worker number ({}) cannot be larger than the total number of workers!".format(max_workers)
+                "Worker number ({}) cannot be larger than the total number of workers!".format(
+                    max_workers
+                )
             )
         if worker_number > search_iterations:
             raise ValueError(
@@ -274,10 +288,16 @@ def __init__(
                 )
             )
 
-        print("*** Creating hyperparameter manager for worker {} ***".format(worker_number))
+        print(
+            "*** Creating hyperparameter manager for worker {} ***".format(
+                worker_number
+            )
+        )
 
         hyperparam_folder = os.path.join(root_model_folder, str(worker_number))
-        super().__init__(param_ranges, fixed_params, hyperparam_folder, override_w_fixed_params=True)
+        super().__init__(
+            param_ranges, fixed_params, hyperparam_folder, override_w_fixed_params=True
+        )
 
         serialised_ranges_folder = os.path.join(root_model_folder, "hyperparams")
         if clear_serialised_params:
@@ -287,7 +307,9 @@ def __init__(
 
         utils.create_folder_if_not_exist(serialised_ranges_folder)
 
-        self.serialised_ranges_path = os.path.join(serialised_ranges_folder, "ranges_{}.csv".format(search_iterations))
+        self.serialised_ranges_path = os.path.join(
+            serialised_ranges_folder, "ranges_{}.csv".format(search_iterations)
+        )
         self.hyperparam_folder = hyperparam_folder  # override
         self.worker_num = worker_number
         self.total_search_iterations = search_iterations
@@ -421,7 +443,12 @@ def assign_worker_numbers(self, df):
 
         max_worker_num = int(np.ceil(n / batch_size))
 
-        worker_idx = np.concatenate([np.tile(i + 1, self.num_iterations_per_worker) for i in range(max_worker_num)])
+        worker_idx = np.concatenate(
+            [
+                np.tile(i + 1, self.num_iterations_per_worker)
+                for i in range(max_worker_num)
+            ]
+        )
 
         output["worker"] = worker_idx[: len(output)]
 
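A worked example (all values assumed, and one batch size used for both quantities, unlike the real class) of the worker-index construction above:

import numpy as np

# Assume 9 pending iterations handed out in batches of 4 per worker.
n, num_iterations_per_worker = 9, 4
max_worker_num = int(np.ceil(n / num_iterations_per_worker))  # 3 workers

worker_idx = np.concatenate(
    [np.tile(i + 1, num_iterations_per_worker) for i in range(max_worker_num)]
)
print(worker_idx)      # [1 1 1 1 2 2 2 2 3 3 3 3]
print(worker_idx[:n])  # trimmed to the 9 rows: [1 1 1 1 2 2 2 2 3]
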