[python] memory error

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88

"""
Convert the Merck data set into a form suitable to be fed to the CNN
 
1) remove columns that do not appear in both
training and test
2) normalize the activation to have zero mean and 1 std (z-score)
3) rescale the features, either to 0-1 by dividing each column by its training max ('uniform') or with y = log(x+1) ('log')
 
"""
 
import pandas as pd
import numpy as np
from nutsflow import *
from nutsml import *
import sys
 
# Directory holding the raw Merck CSV files, and the directory the
# preprocessed copies (plus the per-dataset statistics) are written to.
data_root = './DATA/merck/'
save_root = './DATA/merck/preprocessed/'
# Feature rescaling mode:
#   'log'     -> y = log(x + 1)
#   'uniform' -> divide each feature column by its training-set maximum
FEATURE_SCALE = 'log'   # 'uniform'

# Validate the configuration up front so a typo is reported before any of the
# (large) CSV files are parsed, rather than after the first one is loaded.
if FEATURE_SCALE not in ('log', 'uniform'):
    sys.exit("Feature normalization method not defined correctly, check FEATURE_SCALE. ")


dataset_names = ['3A4', 'CB1', 'DPP4', 'HIVINT', 'HIVPROT', 'LOGD', 'METAB', 'NK1', 'OX1', 'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI', 'THROMBIN']

stat_hold = list()  # (dataset name, mean, std) of the training 'Act' column per data-set

for dataset_name in dataset_names:

    test_filename = data_root + dataset_name + '_test_disguised.csv'
    train_filename = data_root + dataset_name + '_training_disguised.csv'

    test_filename_save = save_root + dataset_name + '_test_disguised.csv'
    train_filename_save = save_root + dataset_name + '_training_disguised.csv'

    print('Preprocessing dataset  ' + dataset_name)

    # Parse only the header rows first: this yields the column names without
    # materialising the full tables in memory.
    train_columns = pd.read_csv(train_filename, nrows=0).columns.values
    test_columns = pd.read_csv(test_filename, nrows=0).columns.values

    print(len(train_columns))
    print(len(test_columns))

    # Keep only the columns common to both files. The intersection is built
    # once (not once per column) and each list keeps its own file's order.
    common_columns = set(train_columns) & set(test_columns)
    train_inx = [inx for inx in train_columns if inx in common_columns]
    test_inx = [inx for inx in test_columns if inx in common_columns]

    # Load just the shared columns, so columns that would be discarded anyway
    # are never allocated. read_csv returns them in file order.
    train = pd.read_csv(train_filename, usecols=train_inx)
    test = pd.read_csv(test_filename, usecols=test_inx)

    print(train.shape)
    print(test.shape)

    # Normalize activations (z-score) using statistics of the TRAINING set
    # only; np.std is the population standard deviation (ddof=0).
    X = np.asarray(train['Act'])
    x_mean = np.mean(X)
    x_std = np.std(X)

    stat_hold.append((dataset_name, x_mean, x_std))

    train['Act'] = (train['Act'] - x_mean) / x_std
    test['Act'] = (test['Act'] - x_mean) / x_std

    # Rescale features. Every column from position 2 onward is treated as a
    # feature (presumably the first two are the molecule id and 'Act' --
    # verify against the CSV layout). .iloc makes the positional intent
    # explicit and replaces the deprecated .ix indexer.
    if FEATURE_SCALE == 'log':
        train.iloc[:, 2:] = np.log(train.iloc[:, 2:] + 1)
        test.iloc[:, 2:] = np.log(test.iloc[:, 2:] + 1)

    elif FEATURE_SCALE == 'uniform':
        max_feature = train.iloc[:, 2:].max(axis=0)
        # A feature that is zero for every training row would otherwise
        # produce 0/0 = NaN (and x/0 = inf on the test set); divide by 1
        # instead so such columns pass through unchanged.
        max_feature = max_feature.replace(0, 1)
        train.iloc[:, 2:] = train.iloc[:, 2:] / max_feature
        test.iloc[:, 2:] = test.iloc[:, 2:] / max_feature

    # save data to csv
    train.to_csv(train_filename_save, index=False)
    test.to_csv(test_filename_save, index=False)

    # Drop the references now so this data-set's frames can be reclaimed
    # before the next (possibly larger) one is parsed.
    del train, test

    print('Done dataset  ' + dataset_name)

# Write one (name, mean, std) row per data-set so the normalization of 'Act'
# can be looked up later.
writer = WriteCSV(save_root + 'dataset_stats.csv')
stat_hold >> writer
 
[Colored by Color Scripter](http://colorscripter.com/info#e)

[cs](http://colorscripter.com/info#e)

에러가 뜬다. 왜일까?? 뭘까??

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

Preprocessing dataset  3A4
 
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-1-1aeca87bf78b> in <module>()
     34     print 'Preprocessing dataset ', dataset_name
     35 
---> 36     train = pd.read_csv(train_filename)
     37     test = pd.read_csv(test_filename)
     38 
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    700                     skip_blank_lines=skip_blank_lines)
    701 
--> 702         return _read(filepath_or_buffer, kwds)
    703 
    704     parser_f.__name__ = name
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    433 
    434     try:
--> 435         data = parser.read(nrows)
    436     finally:
    437         parser.close()
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
   1152             new_rows = len(index)
   1153 
-> 1154         df = DataFrame(col_dict, columns=columns, index=index)
   1155 
   1156         self._currow += new_rows
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    390                                  dtype=dtype, copy=copy)
    391         elif isinstance(data, dict):
--> 392             mgr = init_dict(data, index, columns, dtype=dtype)
    393         elif isinstance(data, ma.MaskedArray):
    394             import numpy.ma.mrecords as mrecords
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/construction.pyc in init_dict(data, index, columns, dtype)
    210         arrays = [data[k] for k in keys]
    211 
--> 212     return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    213 
    214 
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/construction.pyc in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
     59     axes = [ensure_index(columns), index]
     60 
---> 61     return create_block_manager_from_arrays(arrays, arr_names, axes)
     62 
     63 
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in create_block_manager_from_arrays(arrays, names, axes)
   1664 
   1665     try:
-> 1666         blocks = form_blocks(arrays, names, axes)
   1667         mgr = BlockManager(blocks, axes)
   1668         mgr._consolidate_inplace()
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in form_blocks(arrays, names, axes)
   1732 
   1733     if len(items_dict['IntBlock']):
-> 1734         int_blocks = _multi_blockify(items_dict['IntBlock'])
   1735         blocks.extend(int_blocks)
   1736 
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in _multi_blockify(tuples, dtype)
   1817     for dtype, tup_block in grouper:
   1818 
-> 1819         values, placement = _stack_arrays(list(tup_block), dtype)
   1820 
   1821         block = make_block(values, placement=placement)
 
/home/uosest/.conda/envs/py27/lib/python2.7/site-packages/pandas/core/internals/managers.pyc in _stack_arrays(tuples, dtype)
   1859     shape = (len(arrays),) + _shape_compat(first)
   1860 
-> 1861     stacked = np.empty(shape, dtype=dtype)
   1862     for i, arr in enumerate(arrays):
   1863         stacked[i] = _asarray_compat(arr)
 
MemoryError: 
 
 
[Colored by Color Scripter](http://colorscripter.com/info#e)

[cs](http://colorscripter.com/info#e)

메모리가 부족한가 보다. 8기가 밖에 할당을 안해서!?--> 8기가는 택도없구나

Champion Program

[python] memory error

티스토리툴바