1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
"""
Convert the Merck data set into a form suitable to be fed to the CNN:
1) remove columns that do not appear in both
training and test
2) normalize the activation to have zero mean and 1 std (z-score)
3) rescale the features to 0-1 by dividing each column by its training max or y = log(x+1)

NOTE(review): this listing is written for Python 2 (print statements) and
appears to be an incomplete paste: indentation is lost and several
statements referenced below are not present in the visible lines.
"""
import pandas as pd
import numpy as np
from nutsflow import *
from nutsml import *
import sys
# Input directory for the raw CSV files and output directory for the results.
data_root = './DATA/merck/'
save_root = './DATA/merck/preprocessed/'
# Selects the feature rescaling branch below; accepted values: 'log', 'uniform'.
FEATURE_SCALE = 'log' # alternative: 'uniform'
dataset_names = ['3A4', 'CB1', 'DPP4', 'HIVINT', 'HIVPROT', 'LOGD', 'METAB', 'NK1', 'OX1', 'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI', 'THROMBIN']
# NOTE(review): nothing in the visible lines appends to stat_hold.
stat_hold = list() # hold the mean and standard deviation for each data-set
for dataset_name in dataset_names:
print 'Preprocessing dataset ', dataset_name
# NOTE(review): train_filename / test_filename are not defined in the visible
# lines; presumably built from data_root and dataset_name - TODO confirm.
train = pd.read_csv(train_filename)
test = pd.read_csv(test_filename)
train_inx_set = set(train.columns.values)
test_inx_set = set(test.columns.values)
# remove columns that are not common to both training and test sets
# (each list keeps the column order of its own frame)
train_inx = [inx for inx in train.columns.values if inx in set.intersection(train_inx_set, test_inx_set)]
test_inx = [inx for inx in test.columns.values if inx in set.intersection(train_inx_set, test_inx_set)]
# print train_inx
# print test_inx
train = train[train_inx]
test = test[test_inx]
print train.shape
print test.shape
# Normalize activations
# Only the mean and standard deviation of the training 'Act' column are
# computed here; the z-score step itself is not visible in this listing.
X = np.asarray(train.Act)
x_mean = np.mean(X)
x_std = np.std(X)
# rescale features
# NOTE(review): the bodies of the 'log' and 'uniform' branches are missing
# from this listing; only the fallback branch is visible.
if FEATURE_SCALE == 'log':
elif FEATURE_SCALE == 'uniform':
else:
sys.exit("Feature normalization method not defined correctly, check FEATURE_SCALE. ")
# save data to csv
# NOTE(review): train_filename_save / test_filename_save are not defined in
# the visible lines; presumably derived from save_root - TODO confirm.
train.to_csv(train_filename_save, index=False)
test.to_csv(test_filename_save, index=False)
print 'Done dataset ', dataset_name
# NOTE(review): 'writer' is not defined in the visible lines; presumably a
# nutsflow/nutsml sink that stat_hold is piped into - TODO confirm.
stat_hold >> writer
[Colored by Color Scripter](http://colorscripter.com/info#e)
|
[cs](http://colorscripter.com/info#e) |
에러가 뜬다. 왜일까?? 뭘까??
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
Preprocessing dataset 3A4
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-1-1aeca87bf78b> in <module>()
34 print 'Preprocessing dataset ', dataset_name
35
---> 36 train = pd.read_csv(train_filename)
37 test = pd.read_csv(test_filename)
38
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
700 skip_blank_lines=skip_blank_lines)
701
--> 702 return _read(filepath_or_buffer, kwds)
703
704 parser_f.__name__ = name
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
433
434 try:
436 finally:
437 parser.close()
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
1152 new_rows = len(index)
1153
-> 1154 df = DataFrame(col_dict, columns=columns, index=index)
1155
1156 self._currow += new_rows
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
390 dtype=dtype, copy=copy)
391 elif isinstance(data, dict):
--> 392 mgr = init_dict(data, index, columns, dtype=dtype)
393 elif isinstance(data, ma.MaskedArray):
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/construction.pyc in init_dict(data, index, columns, dtype)
210 arrays = [data[k] for k in keys]
211
--> 212 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
213
214
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/construction.pyc in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
59 axes = [ensure_index(columns), index]
60
---> 61 return create_block_manager_from_arrays(arrays, arr_names, axes)
62
63
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in create_block_manager_from_arrays(arrays, names, axes)
1664
1665 try:
-> 1666 blocks = form_blocks(arrays, names, axes)
1667 mgr = BlockManager(blocks, axes)
1668 mgr._consolidate_inplace()
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in form_blocks(arrays, names, axes)
1732
1733 if len(items_dict['IntBlock']):
-> 1734 int_blocks = _multi_blockify(items_dict['IntBlock'])
1736
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in _multi_blockify(tuples, dtype)
1817 for dtype, tup_block in grouper:
1818
-> 1819 values, placement = _stack_arrays(list(tup_block), dtype)
1820
1821 block = make_block(values, placement=placement)
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in _stack_arrays(tuples, dtype)
1859 shape = (len(arrays),) + _shape_compat(first)
1860
1862 for i, arr in enumerate(arrays):
1863 stacked[i] = _asarray_compat(arr)
MemoryError:
[Colored by Color Scripter](http://colorscripter.com/info#e)
|
[cs](http://colorscripter.com/info#e) |
메모리가 부족한가 보다. 8기가밖에 할당을 안 해서!? --> 8기가로는 택도 없구나.