Training Models with Google Colab

Mount Google Drive and change the working directory

Method 1

!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
!mkdir -p drive
!google-drive-ocamlfuse -o nonempty drive
import os
os.chdir("drive/colab/resnet_20191204") #修改此处来修改colab在google drive中的运行路径

Method 2

from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir(r'/content/drive/My Drive/colab/resnet_20191204')
print(os.getcwd())

Installing PyTorch with a Linux command

! pip3 install torch torchvision
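
Colab runtimes usually ship with a recent PyTorch preinstalled, so this command mainly makes sure torchvision is present; a quick sanity check afterwards:

import torch, torchvision
print(torch.__version__, torchvision.__version__)
print('CUDA available:', torch.cuda.is_available())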

Training ResNet with FashionMNIST

import time
import torch
import torchvision
from torch import nn,optim
import torch.nn.functional as F
import sys
sys.path.append("..") # add the parent directory to the interpreter's module search path; sys.path.append("d2lzh_pytorch") also works
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.__version__)
print(device)

class Residual(nn.Module):  # this class is also saved in the d2lzh_pytorch package for later use
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y + X)

net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels  # the first block keeps the same number of channels as its input
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

class GlobalAvgPool2d(nn.Module):
    # global average pooling: set the pooling window to the height and width of the input
    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()
    def forward(self, x):
        return F.avg_pool2d(x, kernel_size=x.size()[2:])

class FlattenLayer(torch.nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()
    def forward(self, x):  # x shape: (batch, *, *, ...)
        return x.view(x.shape[0], -1)

net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))
net.add_module("global_avg_pool", GlobalAvgPool2d()) # GlobalAvgPool2d的输出: (Batch, 512, 1, 1)
net.add_module("fc", nn.Sequential(FlattenLayer(), nn.Linear(512, 10)))

X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)  # the second dimension is the channel count, the last two are height and width

def load_data_fashion_mnist(batch_size, resize=None, root='~/Datasets/FashionMNIST'):
    """Download the Fashion-MNIST dataset and load it into memory."""
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    trans.append(torchvision.transforms.ToTensor())

    transform = torchvision.transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True, transform=transform)
    if sys.platform.startswith('win'):
        num_workers = 0  # 0 means no extra worker processes are used to speed up data loading
    else:
        num_workers = 4
    train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_iter, test_iter

def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # if no device is specified, use the device of net's parameters
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()  # evaluation mode: this disables dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()  # switch back to training mode
            else:  # custom model (not used after section 3.13 of the book), GPU not considered
                if 'is_training' in net.__code__.co_varnames:  # if the function takes an is_training argument
                    # set is_training to False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n

def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    loss = torch.nn.CrossEntropyLoss()
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

batch_size = 256
# if an "out of memory" error appears, reduce batch_size or the resize value
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
1.3.1
cuda
0 output shape: torch.Size([1, 64, 112, 112])
1 output shape: torch.Size([1, 64, 112, 112])
2 output shape: torch.Size([1, 64, 112, 112])
3 output shape: torch.Size([1, 64, 56, 56])
resnet_block1 output shape: torch.Size([1, 64, 56, 56])
resnet_block2 output shape: torch.Size([1, 128, 28, 28])
resnet_block3 output shape: torch.Size([1, 256, 14, 14])
resnet_block4 output shape: torch.Size([1, 512, 7, 7])
global_avg_pool output shape: torch.Size([1, 512, 1, 1])
fc output shape: torch.Size([1, 10])
training on cuda
epoch 1, loss 0.4035, train acc 0.852, test acc 0.892, time 28.7 sec
epoch 2, loss 0.1233, train acc 0.908, test acc 0.904, time 28.5 sec
epoch 3, loss 0.0698, train acc 0.922, test acc 0.909, time 28.5 sec
epoch 4, loss 0.0446, train acc 0.934, test acc 0.916, time 28.5 sec
epoch 5, loss 0.0306, train acc 0.943, test acc 0.905, time 28.5 sec

For comparison, below is the same first epoch run on my own laptop's GTX 960M; the Colab GPU is more than 11 times faster (about 28.7 s vs. 325.2 s per epoch).

training on  cuda
epoch 1, loss 0.3956, train acc 0.854, test acc 0.888, time 325.2 sec

Free GPU compute

!nvidia-smi
Wed Dec 4 12:29:39 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01 Driver Version: 418.67 CUDA Version: 10.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-PCIE... Off | 00000000:00:04.0 Off | 0 |
| N/A 38C P0 34W / 250W | 2697MiB / 16280MiB | 0% Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
+-----------------------------------------------------------------------------+

Across two sessions I was allocated a Tesla K80 and a Tesla P100, respectively.
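
Besides !nvidia-smi, the allocated card can also be queried from PyTorch itself; a small sketch:

import torch
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # e.g. 'Tesla K80' or 'Tesla P100-PCIE-16GB'
    print('%.1f GiB' % (torch.cuda.get_device_properties(0).total_memory / 1024**3))
else:
    print('No GPU allocated; check Runtime -> Change runtime type')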

Viewing a random batch of data

import matplotlib.pyplot as plt
import numpy as np
import torchvision

# functions to show an image


def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.cpu().numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress',
           'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot')
# get some random training images
dataiter = iter(train_iter)
images, labels = next(dataiter)  # the .next() method is not available in newer PyTorch versions

# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

Testing on a randomly selected batch

PATH = './resnet_20191204.pth'
torch.save(net.state_dict(), PATH)

dataiter = iter(test_iter)
images, labels = next(dataiter)  # iterate with iter()/next() to fetch one batch of loaded data
images, labels = images.to(device), labels.to(device)

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))
#net = Net()
net.load_state_dict(torch.load(PATH))
outputs = net(images)
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
for j in range(4)))
GroundTruth: Ankle boot Pullover Trouser Trouser
Predicted: Ankle boot Pullover Trouser Trouser
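
The four samples above are only a spot check; the whole test set can be scored with the evaluate_accuracy helper defined earlier, for example:

net.load_state_dict(torch.load(PATH))  # reload the saved weights (optional if net is still in memory)
print('test acc on the full test set: %.3f' % evaluate_accuracy(test_iter, net))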

Using tensorboardcolab in Kaggle

Notes on using Tensorboardcolab

!pip install tensorboardcolab
from tensorboardcolab import TensorBoardColab

tb = TensorBoardColab()  # initialize; this starts TensorBoard and prints a public link

# inside the training loop, log the per-epoch metrics
tb.save_value('Train Loss', 'train_loss', epoch, epoch_loss)
tb.save_value('Train Accuracy', 'train_acc', epoch, epoch_acc)
tb.flush_line('train_loss')
tb.flush_line('train_acc')
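
With the train_ch5 function from the ResNet example above, the values to log can be taken from its per-epoch accumulators; a minimal sketch, assuming tb has been created as shown:

# inside train_ch5, at the end of each epoch (after the inner batch loop):
epoch_loss = train_l_sum / batch_count
epoch_acc = train_acc_sum / n
tb.save_value('Train Loss', 'train_loss', epoch, epoch_loss)
tb.save_value('Train Accuracy', 'train_acc', epoch, epoch_acc)
tb.flush_line('train_loss')
tb.flush_line('train_acc')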

The core of Tensorboardcolab

import tensorflow as tf
from keras.callbacks import TensorBoard
import time
import os
import io

class TensorBoardColab:
    def __init__(self, port=6006, graph_path='./Graph', startup_waiting_time=8):
        self.port = port
        self.graph_path = graph_path
        self.writer = None
        self.deep_writers = {}
        self.eager_execution = None
        get_ipython().system_raw('npm i -s -q --unsafe-perm -g ngrok')  # sudo npm i -s -q --unsafe-perm -g ngrok

        setup_passed = False
        retry_count = 0
        sleep_time = startup_waiting_time / 3.0
        while not setup_passed:
            get_ipython().system_raw('kill -9 $(sudo lsof -t -i:%d)' % port)
            get_ipython().system_raw('rm -Rf ' + graph_path)
            print('Wait for %d seconds...' % startup_waiting_time)
            time.sleep(sleep_time)
            get_ipython().system_raw('tensorboard --logdir %s --host 0.0.0.0 --port %d &' % (graph_path, port))
            time.sleep(sleep_time)
            get_ipython().system_raw('ngrok http %d &' % port)
            time.sleep(sleep_time)
            try:
                tensorboard_link = get_ipython().getoutput(
                    'curl -s http://localhost:4040/api/tunnels | python3 -c "import sys, json; print(json.load(sys.stdin))"')[0]
                tensorboard_link = eval(tensorboard_link)['tunnels'][0]['public_url']
                setup_passed = True
            except:
                setup_passed = False
                retry_count += 1
                print('Initialization failed, retry again (%d)' % retry_count)
                print('\n')

        print("TensorBoard link:")
        print(tensorboard_link)

    def get_graph_path(self):
        return self.graph_path

    def is_eager_execution(self):
        if self.eager_execution is None:
            try:
                tf.summary.FileWriter(self.graph_path)
                self.eager_execution = False
            except Exception as err:
                self.eager_execution = str(err) == 'tf.summary.FileWriter is not compatible with eager execution. Use tf.contrib.summary instead.'
        return self.eager_execution

    def get_writer(self):
        if self.writer is None:
            if self.is_eager_execution():
                self.writer = tf.contrib.summary.create_file_writer(self.graph_path)
            else:
                self.writer = tf.summary.FileWriter(self.graph_path)

        return self.writer

    def get_deep_writers(self, name):
        if not (name in self.deep_writers):
            log_path = os.path.join(self.graph_path, name)
            if self.is_eager_execution():
                self.deep_writers[name] = tf.contrib.summary.create_file_writer(log_path)
            else:
                self.deep_writers[name] = tf.summary.FileWriter(log_path)
        return self.deep_writers[name]

    def save_image(self, title, image):
        image_path = os.path.join(self.graph_path, 'images')
        if self.is_eager_execution():
            print('Warning: save_image() is not supported in eager execution mode')
            # writer = tf.contrib.summary.create_file_writer(image_path)
            # writer.set_as_default()
            # with tf.contrib.summary.always_record_summaries():
            #     tf.contrib.summary.image(
            #         title,
            #         image_tensor
            #     )
        else:
            summary_op = tf.summary.image(title, image)
            with tf.Session() as sess:
                summary = sess.run(summary_op)
                writer = tf.summary.FileWriter(image_path)
                writer.add_summary(summary)
                writer.close()

    def save_value(self, graph_name, line_name, epoch, value):
        if self.is_eager_execution():
            self.get_deep_writers(line_name).set_as_default()
            global_step = tf.train.get_or_create_global_step()
            global_step.assign(epoch)
            with tf.contrib.summary.always_record_summaries():
                tf.contrib.summary.scalar(graph_name, value)
        else:
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value
            summary_value.tag = graph_name
            self.get_deep_writers(line_name).add_summary(summary, epoch)

    def flush_line(self, line_name):
        self.get_deep_writers(line_name).flush()

    def close(self):
        if self.writer is not None:
            self.writer.close()
            self.writer = None
        for key in self.deep_writers:
            self.deep_writers[key].close()
        self.deep_writers = {}

cv2.imshow and "cannot connect to X server"

The X server is the service that provides the graphical user interface on Linux. When a client host accesses a graphical program running on a server, the server must grant that client permission to access the graphical program.

OpenCV's imshow relies on highgui, and a command-line-only environment such as Colab cannot create a graphical window.

Use matplotlib instead

import cv2
from matplotlib import pyplot as plt
import numpy as np

original_image = 'a.jpg'
image = cv2.imread(original_image)
show_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(show_img)
plt.show()
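
In Colab specifically there is also a patched helper, cv2_imshow from google.colab.patches, which renders OpenCV images inline without an X server and takes the BGR array from cv2.imread directly; a short sketch:

from google.colab.patches import cv2_imshow  # Colab-specific replacement for cv2.imshow
import cv2

image = cv2.imread('a.jpg')
cv2_imshow(image)  # displays inline in the notebook, no window or X server required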

Editing .py files online

anyfile-notepad