Training Models with Google Colab

Mount Google Drive and change the working directory

Method 1

!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
!mkdir -p drive
!google-drive-ocamlfuse -o nonempty drive
import os
os.chdir("drive/colab/resnet_20191204") #修改此处来修改colab在google drive中的运行路径

Method 2

from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir(r'/content/drive/My Drive/colab/resnet_20191204')
print(os.getcwd())

Installing PyTorch with a Linux command

! pip3 install torch torchvision
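
Colab runtimes usually ship with a recent PyTorch preinstalled, so this command mainly makes sure torchvision is present; a quick sanity check afterwards:

import torch, torchvision
print(torch.__version__, torchvision.__version__)
print('CUDA available:', torch.cuda.is_available())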

Training ResNet with FashionMNIST

import time
import torch
import torchvision
from torch import nn,optim
import torch.nn.functional as F
import sys
sys.path.append("..") # add the parent directory to the interpreter's module search path; sys.path.append("d2lzh_pytorch") also works
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.__version__)
print(device)

class Residual(nn.Module):  # this class is also saved in the d2lzh_pytorch package for later use
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y + X)

net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels  # the first block keeps the same number of channels as its input
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

class GlobalAvgPool2d(nn.Module):
    # global average pooling: set the pooling window to the height and width of the input
    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()
    def forward(self, x):
        return F.avg_pool2d(x, kernel_size=x.size()[2:])

class FlattenLayer(torch.nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()
    def forward(self, x):  # x shape: (batch, *, *, ...)
        return x.view(x.shape[0], -1)

net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))
net.add_module("global_avg_pool", GlobalAvgPool2d()) # GlobalAvgPool2d的输出: (Batch, 512, 1, 1)
net.add_module("fc", nn.Sequential(FlattenLayer(), nn.Linear(512, 10)))

X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)  # the second dimension is the channel count, the last two are height and width

def load_data_fashion_mnist(batch_size, resize=None, root='~/Datasets/FashionMNIST'):
    """Download the Fashion-MNIST dataset and load it into memory."""
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    trans.append(torchvision.transforms.ToTensor())

    transform = torchvision.transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True, transform=transform)
    if sys.platform.startswith('win'):
        num_workers = 0  # 0 means no extra worker processes are used to speed up data loading
    else:
        num_workers = 4
    train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_iter, test_iter

def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # if no device is specified, use the device of net's parameters
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()  # evaluation mode: this disables dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()  # switch back to training mode
            else:  # custom model (not used after section 3.13 of the book), GPU not considered
                if 'is_training' in net.__code__.co_varnames:  # if the function takes an is_training argument
                    # set is_training to False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n

def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    loss = torch.nn.CrossEntropyLoss()
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

batch_size = 256
# if an "out of memory" error appears, reduce batch_size or the resize value
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
1.3.1
cuda
0 output shape: torch.Size([1, 64, 112, 112])
1 output shape: torch.Size([1, 64, 112, 112])
2 output shape: torch.Size([1, 64, 112, 112])
3 output shape: torch.Size([1, 64, 56, 56])
resnet_block1 output shape: torch.Size([1, 64, 56, 56])
resnet_block2 output shape: torch.Size([1, 128, 28, 28])
resnet_block3 output shape: torch.Size([1, 256, 14, 14])
resnet_block4 output shape: torch.Size([1, 512, 7, 7])
global_avg_pool output shape: torch.Size([1, 512, 1, 1])
fc output shape: torch.Size([1, 10])
training on cuda
epoch 1, loss 0.4035, train acc 0.852, test acc 0.892, time 28.7 sec
epoch 2, loss 0.1233, train acc 0.908, test acc 0.904, time 28.5 sec
epoch 3, loss 0.0698, train acc 0.922, test acc 0.909, time 28.5 sec
epoch 4, loss 0.0446, train acc 0.934, test acc 0.916, time 28.5 sec
epoch 5, loss 0.0306, train acc 0.943, test acc 0.905, time 28.5 sec

For comparison, below is the same first epoch run on my own laptop's GTX 960M; the Colab GPU is more than 11 times faster (about 28.7 s vs. 325.2 s per epoch).

training on  cuda
epoch 1, loss 0.3956, train acc 0.854, test acc 0.888, time 325.2 sec

Free GPU compute

!nvidia-smi
Wed Dec 4 12:29:39 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01 Driver Version: 418.67 CUDA Version: 10.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-PCIE... Off | 00000000:00:04.0 Off | 0 |
| N/A 38C P0 34W / 250W | 2697MiB / 16280MiB | 0% Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
+-----------------------------------------------------------------------------+

Across two sessions I was allocated a Tesla K80 and a Tesla P100, respectively.
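
Besides !nvidia-smi, the allocated card can also be queried from PyTorch itself; a small sketch:

import torch
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # e.g. 'Tesla K80' or 'Tesla P100-PCIE-16GB'
    print('%.1f GiB' % (torch.cuda.get_device_properties(0).total_memory / 1024**3))
else:
    print('No GPU allocated; check Runtime -> Change runtime type')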

Viewing a random batch of data

import matplotlib.pyplot as plt
import numpy as np
import torchvision

# functions to show an image


def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.cpu().numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress',
           'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot')
# get some random training images
dataiter = iter(train_iter)
images, labels = next(dataiter)  # the .next() method is not available in newer PyTorch versions

# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

Testing on a randomly selected batch

PATH = './resnet_20191204.pth'
torch.save(net.state_dict(), PATH)

dataiter = iter(test_iter)
images, labels = next(dataiter)  # iterate with iter()/next() to fetch one batch of loaded data
images, labels = images.to(device), labels.to(device)

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))
#net = Net()
net.load_state_dict(torch.load(PATH))
outputs = net(images)
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
for j in range(4)))
GroundTruth: Ankle boot Pullover Trouser Trouser
Predicted: Ankle boot Pullover Trouser Trouser
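
The four samples above are only a spot check; the whole test set can be scored with the evaluate_accuracy helper defined earlier, for example:

net.load_state_dict(torch.load(PATH))  # reload the saved weights (optional if net is still in memory)
print('test acc on the full test set: %.3f' % evaluate_accuracy(test_iter, net))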

Using tensorboardcolab in Kaggle

Notes on using Tensorboardcolab

!pip install tensorboardcolab
from tensorboardcolab import TensorBoardColab

tb = TensorBoardColab()  # initialize; this starts TensorBoard and prints a public link

# inside the training loop, log the per-epoch metrics
tb.save_value('Train Loss', 'train_loss', epoch, epoch_loss)
tb.save_value('Train Accuracy', 'train_acc', epoch, epoch_acc)
tb.flush_line('train_loss')
tb.flush_line('train_acc')
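
With the train_ch5 function from the ResNet example above, the values to log can be taken from its per-epoch accumulators; a minimal sketch, assuming tb has been created as shown:

# inside train_ch5, at the end of each epoch (after the inner batch loop):
epoch_loss = train_l_sum / batch_count
epoch_acc = train_acc_sum / n
tb.save_value('Train Loss', 'train_loss', epoch, epoch_loss)
tb.save_value('Train Accuracy', 'train_acc', epoch, epoch_acc)
tb.flush_line('train_loss')
tb.flush_line('train_acc')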

The core of Tensorboardcolab

import tensorflow as tf
from keras.callbacks import TensorBoard
import time
import os
import io

class TensorBoardColab:
    def __init__(self, port=6006, graph_path='./Graph', startup_waiting_time=8):
        self.port = port
        self.graph_path = graph_path
        self.writer = None
        self.deep_writers = {}
        self.eager_execution = None
        get_ipython().system_raw('npm i -s -q --unsafe-perm -g ngrok')  # sudo npm i -s -q --unsafe-perm -g ngrok

        setup_passed = False
        retry_count = 0
        sleep_time = startup_waiting_time / 3.0
        while not setup_passed:
            get_ipython().system_raw('kill -9 $(sudo lsof -t -i:%d)' % port)
            get_ipython().system_raw('rm -Rf ' + graph_path)
            print('Wait for %d seconds...' % startup_waiting_time)
            time.sleep(sleep_time)
            get_ipython().system_raw('tensorboard --logdir %s --host 0.0.0.0 --port %d &' % (graph_path, port))
            time.sleep(sleep_time)
            get_ipython().system_raw('ngrok http %d &' % port)
            time.sleep(sleep_time)
            try:
                tensorboard_link = get_ipython().getoutput(
                    'curl -s http://localhost:4040/api/tunnels | python3 -c "import sys, json; print(json.load(sys.stdin))"')[0]
                tensorboard_link = eval(tensorboard_link)['tunnels'][0]['public_url']
                setup_passed = True
            except:
                setup_passed = False
                retry_count += 1
                print('Initialization failed, retry again (%d)' % retry_count)
                print('\n')

        print("TensorBoard link:")
        print(tensorboard_link)

    def get_graph_path(self):
        return self.graph_path

    def is_eager_execution(self):
        if self.eager_execution is None:
            try:
                tf.summary.FileWriter(self.graph_path)
                self.eager_execution = False
            except Exception as err:
                self.eager_execution = str(err) == 'tf.summary.FileWriter is not compatible with eager execution. Use tf.contrib.summary instead.'
        return self.eager_execution

    def get_writer(self):
        if self.writer is None:
            if self.is_eager_execution():
                self.writer = tf.contrib.summary.create_file_writer(self.graph_path)
            else:
                self.writer = tf.summary.FileWriter(self.graph_path)

        return self.writer

    def get_deep_writers(self, name):
        if not (name in self.deep_writers):
            log_path = os.path.join(self.graph_path, name)
            if self.is_eager_execution():
                self.deep_writers[name] = tf.contrib.summary.create_file_writer(log_path)
            else:
                self.deep_writers[name] = tf.summary.FileWriter(log_path)
        return self.deep_writers[name]

    def save_image(self, title, image):
        image_path = os.path.join(self.graph_path, 'images')
        if self.is_eager_execution():
            print('Warning: save_image() is not supported in eager execution mode')
            # writer = tf.contrib.summary.create_file_writer(image_path)
            # writer.set_as_default()
            # with tf.contrib.summary.always_record_summaries():
            #     tf.contrib.summary.image(
            #         title,
            #         image_tensor
            #     )
        else:
            summary_op = tf.summary.image(title, image)
            with tf.Session() as sess:
                summary = sess.run(summary_op)
                writer = tf.summary.FileWriter(image_path)
                writer.add_summary(summary)
                writer.close()

    def save_value(self, graph_name, line_name, epoch, value):
        if self.is_eager_execution():
            self.get_deep_writers(line_name).set_as_default()
            global_step = tf.train.get_or_create_global_step()
            global_step.assign(epoch)
            with tf.contrib.summary.always_record_summaries():
                tf.contrib.summary.scalar(graph_name, value)
        else:
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value
            summary_value.tag = graph_name
            self.get_deep_writers(line_name).add_summary(summary, epoch)

    def flush_line(self, line_name):
        self.get_deep_writers(line_name).flush()

    def close(self):
        if self.writer is not None:
            self.writer.close()
            self.writer = None
        for key in self.deep_writers:
            self.deep_writers[key].close()
        self.deep_writers = {}

cv2.imshow and "cannot connect to X server"

The X server is the service that provides the graphical user interface on Linux. When a client host accesses a graphical program running on a server, the server must grant that client permission to access the graphical program.

OpenCV's imshow relies on highgui, and a command-line-only environment such as Colab cannot create a graphical window.

Use matplotlib instead

import cv2
from matplotlib import pyplot as plt
import numpy as np

original_image = 'a.jpg'
image = cv2.imread(original_image)
show_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(show_img)
plt.show()
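
In Colab specifically there is also a patched helper, cv2_imshow from google.colab.patches, which renders OpenCV images inline without an X server and takes the BGR array from cv2.imread directly; a short sketch:

from google.colab.patches import cv2_imshow  # Colab-specific replacement for cv2.imshow
import cv2

image = cv2.imread('a.jpg')
cv2_imshow(image)  # displays inline in the notebook, no window or X server required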

Editing .py files online

anyfile-notepad