15. In-place Operation
• The input is usually overwritten by the output as the algorithm executes.
• An in-place operation updates the input sequence only by replacing or swapping its elements (see the sketch below).
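As a minimal illustration of this definition (a sketch added here, not taken from the slides), reversing a Python list by swapping its elements is an in-place operation: the result overwrites the input and no second sequence is allocated.

def reverse_in_place(seq):
    i, j = 0, len(seq) - 1
    while i < j:
        seq[i], seq[j] = seq[j], seq[i]   # overwrite the input by swapping
        i += 1
        j -= 1

data = [3, 1, 2]
reverse_in_place(data)
print(data)   # [2, 1, 3] -- the same list object, modified in place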
19. In-place in PyTorch
import torch

a = torch.zeros(1, 3)
print(a)
print(hex(id(a)))

0 0 0
[torch.FloatTensor of size 1x3]
0x7f4b08813188

[Diagram: the name a refers to address 0x7f4b08813188, which holds FloatTensor([0, 0, 0])]
21. In-place in PyTorch
(each case starts from the zeros tensor created on slide 19)

case 1) out-of-place
a = a + 1
print(a.numpy())
print(hex(id(a)))

[[1. 1. 1.]]
0x7f4b088135c8

case 2) in-place
for i in range(3):
    a[:, i] += 1
print(a.numpy())
print(hex(id(a)))

[[1. 1. 1.]]
0x7f4b08813188
23. In-place in PyTorch
[Diagram: in case 1 (out of place), a is rebound to new storage 0x7f4b088135c8 holding FloatTensor([1, 1, 1]) while 0x7f4b08813188 still holds the old FloatTensor([0, 0, 0]); in case 2 (in-place), a keeps pointing at 0x7f4b08813188, whose contents change from FloatTensor([0, 0, 0]) to FloatTensor([1, 1, 1])]
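Another way to see the difference (a sketch added here, not from the slides) is to compare the address of the underlying storage with Tensor.data_ptr(), which is independent of the Python object identity that id() reports.

a = torch.zeros(1, 3)
p = a.data_ptr()              # address of the underlying storage

b = a + 1                     # out-of-place: the result lives in newly allocated storage
print(b.data_ptr() == p)      # False

a[:, 0] += 1                  # in-place: a's existing storage is modified
print(a.data_ptr() == p)      # True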
25. In-place in PyTorch
case 3) out-of-place
a = a.add(1)
print(a.numpy())
print(hex(id(a)))

[[1. 1. 1.]]
0x7f4b088135c8

case 4) in-place
a.add_(1)
print(a.numpy())
print(hex(id(a)))

[[1. 1. 1.]]
0x7f4b08813188
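The trailing underscore in add_ is PyTorch's general naming convention for in-place variants of tensor methods; a short sketch (added here, not from the slides):

b = a.mul(2)    # out-of-place: returns a new tensor, like case 3
a.mul_(2)       # in-place: modifies a's storage and returns a, like case 4
a.zero_()       # in-place: fills a with zeros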
27. In-place in PyTorch
case 5) in-place
a += 1
print(a.numpy())
print(hex(id(a)))

[[1. 1. 1.]]
0x7f4b08813188

torch/autograd/variable.py::Variable( )
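The pointer to torch/autograd/variable.py::Variable( ) presumably refers to how autograd has to track such in-place updates. As a hedged illustration (added here, using current PyTorch, where Variable has been merged into Tensor): modifying a tensor in place after it has been saved for the backward pass makes autograd refuse to compute the gradient.

x = torch.ones(3, requires_grad=True)
y = torch.sigmoid(x)     # sigmoid saves its output for the backward pass
y.add_(1)                # in-place update of a saved tensor
y.sum().backward()       # RuntimeError: a variable needed for gradient computation
                         # has been modified by an in-place operation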
42. Memory sharing
[Diagram: computation graph A = INPUT, B = Sigmoid(A), C = Pool(B), E = Sigmoid(B), D = Pool(C), F = D + E; B is released after C and E are allocated, and its memory is re-used for D]
Memory sharing: memory used by intermediate results that are no longer needed can be recycled and used in another node (see the code sketch after slide 43).
43. Memory sharing
[Diagram: the same graph repeated, contrasting the two ways to recycle B's memory: performing the operation in-place, or re-using the released buffer for another node]
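In PyTorch code, one way this recycling shows up (a minimal sketch added here, assuming same-shaped activations; it is not the talk's implementation) is to route the outputs of successive nodes into one scratch buffer with the out= argument once the earlier result is no longer needed.

import torch

x = torch.randn(64, 32)
buf = torch.empty_like(x)          # one scratch buffer shared by several nodes

b = torch.sigmoid(x, out=buf)      # B = Sigmoid(A), written into the shared buffer
c = b.relu()                       # C = ReLU(B); after this, B is no longer needed
d = torch.sigmoid(c, out=buf)      # D re-uses the memory that held B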
44. Trade Computation for Memory
• Normalization and non-linearities are applied before/after the conv operation.
• Convolution is most efficient when its input lies in a contiguous block of memory.
• To make a contiguous input, each layer must copy all previous features (concatenation → mem-copy).
• The above operations are computationally extremely cheap.
• Copying into pre-allocated memory is significantly faster than allocating new memory (see the sketch below).
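In current PyTorch, a common way to make this trade is torch.utils.checkpoint, which drops intermediate activations in the forward pass and recomputes them during backward. A minimal sketch (added here, not necessarily the mechanism the talk used):

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

block = nn.Sequential(nn.BatchNorm2d(16), nn.ReLU(), nn.Conv2d(16, 32, 3, padding=1))
x = torch.randn(8, 16, 32, 32, requires_grad=True)

# Activations inside `block` are not kept; they are recomputed during backward,
# trading a little extra computation for a lower peak memory footprint.
y = checkpoint(block, x, use_reentrant=False)
y.sum().backward()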
45. Shared storage for concatenation
• Rather than allocating memory for each concatenation operation, assign the outputs to a memory allocation shared across all layers.
• Because the shared memory storage is used by all network layers, its data is not permanent.
• It therefore needs to be recomputed during back-propagation (see the sketch below).
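A minimal sketch of the idea (added here; buffer sizes are hypothetical and this is not the talk's exact implementation): allocate one flat buffer up front and let every concatenation write into a view of it.

import torch

shared = torch.empty(8 * 64 * 32 * 32)        # one storage shared by all layers

def concat_into_shared(features):
    n, h, w = features[0].size(0), features[0].size(2), features[0].size(3)
    c = sum(f.size(1) for f in features)
    out = shared[: n * c * h * w].view(n, c, h, w)   # contiguous view of the shared buffer
    torch.cat(features, dim=1, out=out)              # mem-copy into pre-allocated memory
    return out

f1 = torch.randn(8, 16, 32, 32)
f2 = torch.randn(8, 24, 32, 32)
cat = concat_into_shared([f1, f2])    # valid only until the next layer overwrites `shared`,
                                      # so it must be recomputed during back-propagation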
46. Shared storage for batch normalization & non-linearity activation
• Assign the outputs of batch normalization / activation to a shared memory allocation.
• The data in the shared memory storage is not permanent and will be overwritten by the next layer.
• The batch normalization / activation outputs should therefore be recomputed during back-propagation (see the sketch below).
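Putting slides 45 and 46 together, a hedged sketch of the pattern (hypothetical layer sizes; the actual memory-efficient implementation differs in detail): wrap the concatenation + batch norm + activation in a checkpointed function so its output does not have to be stored for backward.

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

class DenseLayer(nn.Module):
    def __init__(self, in_channels, growth_rate):
        super().__init__()
        self.norm = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv = nn.Conv2d(in_channels, growth_rate, 3, padding=1, bias=False)

    def bn_function(self, *features):
        # concatenation + batch norm + activation: treated as cheap temporaries
        return self.relu(self.norm(torch.cat(features, dim=1)))

    def forward(self, *features):
        # The BN/ReLU output is not stored for backward; it is recomputed instead.
        # (Caveat: recomputation updates BatchNorm running statistics a second time.)
        bottleneck = checkpoint(self.bn_function, *features, use_reentrant=False)
        return self.conv(bottleneck)

layer = DenseLayer(40, 12)
features = [torch.randn(4, 16, 32, 32, requires_grad=True),
            torch.randn(4, 24, 32, 32)]
out = layer(*features)
out.sum().backward()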