0 loading data
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
%matplotlib inline
%load_ext autoreload
%autoreload 2
Reading Datasets
Below, we demonstrate three different types of datasets that you may encounter and how to load each of them.
- .csv/.txt file (Arrays) - Done
- .csv/.txt file (Labeled Data)
- .nc (NetCDF) file (Spatial-Temporal Data)
# Generate a synthetic regression dataset; the fixed random_state makes the
# draw repeatable from run to run.
X, y = make_regression(
    random_state=123,
    n_samples=1000,
    n_features=100,  # total features
    n_informative=10,  # informative features
    n_targets=20,
    bias=100,
    noise=0.8,
)

# Sanity check: display the shape of the inputs and of the targets.
print(*(generated.shape for generated in (X, y)))
# Write the inputs and the targets to disk as comma-delimited text files,
# inputs first, then targets.
for _csv_path, _array in (
    ("/home/emmanuel/projects/2019_ocean/data/raw/sample_array_data.csv", X),
    ("/home/emmanuel/projects/2019_ocean/data/raw/sample_array_labels.csv", y),
):
    np.savetxt(_csv_path, _array, delimiter=",")

# Remove the in-memory arrays; the loop temporaries are removed as well so
# that no reference to them lingers.
del _csv_path, _array
del X
del y
# # Training and Testing
# xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=5000, random_state=123)
# Read the inputs and the targets back from their comma-delimited text files.
X = np.loadtxt(
    fname="/home/emmanuel/projects/2019_ocean/data/raw/sample_array_data.csv",
    delimiter=",",
)
y = np.loadtxt(
    fname="/home/emmanuel/projects/2019_ocean/data/raw/sample_array_labels.csv",
    delimiter=",",
)

# Sanity check: display the shape of each reloaded array.
print(*(loaded.shape for loaded in (X, y)))