基于SVM的图像识别实战

注意:本文最初使用jupyter notebook编写,后经程序转换为markdown,所以格式可能有多处错误,懒得修改了。

前公司曾经做过一个宣传活动,大概的流程是在一个屋子里面“寻宝”,宝藏是公司的“logo”,找到后用手机扫logo就会得到相应的奖励,效果类似支付宝春节扫“福”。

其实这个项目的原理很简单,只需不停地用摄像头拍到的图片调用一个图像识别接口来判断图片中是否有logo,有的话就弹出奖励。我对这个项目印象还是蛮深的,因为我就是这个项目的PHP后端,不过当时对机器学习还不了解,现在凭印象对这个项目的效果进行复现。


import glob
import os
import pickle
import random
import sys
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

%matplotlib inline

准备训练数据

先来看一下我们要识别的logo,英语流利说,我每天都在用的一个app(流利说的领导看到后,记得把广告费打给我😏)


# Load the logo to be recognised and shrink it to 50x50 pixels.
logo = cv2.resize(cv2.imread('logo/logo.jpg'), (50, 50))
# OpenCV loads images as BGR; convert to RGB for matplotlib.
plt.imshow(cv2.cvtColor(logo, cv2.COLOR_BGR2RGB))

image

因为我们不确定用户扫码logo时的背景,比如用户在室内扫logo的话背景就有可能是办公室,放在桌子上扫的话背景有可能是办公用品,logo只有一张,所以需要“制造”许多含有logo的图片作为训练数据。

先随便选择一张图片作为背景:

# Pick one office photo as a sample background and inspect its original size.
img = cv2.imread('logo/office/000082090_piclink.jpg')
print(img.shape)
# Normalise the background to 100x100 and preview it (BGR -> RGB for matplotlib).
demo = cv2.resize(img, dsize=(100, 100))
plt.imshow(cv2.cvtColor(demo, cv2.COLOR_BGR2RGB))

image

将logo嵌入进去。


def marge2img(img1, img2, threshold=1):
    """Paste the smaller image onto the larger one at a random position.

    The larger of the two arrays is modified in place, whichever argument
    position it was passed in. A pixel of the smaller image is copied only
    when its first three channels are all strictly greater than
    ``threshold``; darker pixels are treated as transparent.

    Args:
        img1: Image array of shape (rows, cols, channels).
        img2: Image array of shape (rows, cols, channels).
        threshold: Per-channel value a pixel must exceed to be copied.

    Returns:
        The larger image (the same object that was modified in place).

    Raises:
        ValueError: If the channel counts differ, or if neither image is
            strictly larger than the other in both rows and columns.
    """
    rows1, cols1, depth1 = img1.shape
    rows2, cols2, depth2 = img2.shape
    if depth1 != depth2:
        raise ValueError('图片位深必须一致')

    if rows1 > rows2 and cols1 > cols2:
        big, small = img1, img2
    elif rows2 > rows1 and cols2 > cols1:
        big, small = img2, img1
    else:
        raise ValueError('图片必须一大一小')

    big_rows, big_cols = big.shape[:2]
    small_rows, small_cols = small.shape[:2]

    # Limit the offsets so the pasted patch always lies inside the big image.
    top = random.randint(0, big_rows - small_rows)
    left = random.randint(0, big_cols - small_cols)

    # Boolean mask of the pixels to copy; the slice below is a view, so the
    # assignment writes straight into ``big``.
    opaque = (small[:, :, :3] > threshold).all(axis=2)
    target = big[top:top + small_rows, left:left + small_cols]
    target[opaque] = small[opaque]
    return big

In [5]:

# Embed the logo into the sample background (in place), save it and preview it.
marge2img(demo, logo, threshold=2)
cv2.imwrite('logo/demo.jpg', demo)
demo_rgb = cv2.cvtColor(demo, cv2.COLOR_BGR2RGB)  # matplotlib expects RGB
plt.imshow(demo_rgb)

image

下面批量处理背景图片,制造更多的训练数据。一半背景嵌入logo,另一半不嵌入logo。


# Collect every background photo from the three source folders.
office = glob.glob('logo/office/*.jpg')
phototheme = glob.glob('logo/phototheme/*.jpg')
destop = glob.glob('logo/destop/*.jpg')

background = office + phototheme + destop

# Randomly split the backgrounds: one half gets the logo, the other does not.
random.shuffle(background)
half = len(background) // 2
embed = background[:half]
unembed = background[half:]


def _load_background(picpath):
    """Read a background image and resize it to 100x100 with three channels.

    Returns None (after printing the path) when the file cannot be decoded
    or the decoded array has no channel axis, so the caller can skip it.
    """
    img = cv2.imread(picpath)
    if img is None:
        print(picpath)
        return None
    if len(img.shape) < 3:
        print(picpath, img.shape)
        return None
    return cv2.resize(img, (100, 100))[:, :, :3]


# cv2.imwrite does not create missing directories, so make sure they exist.
os.makedirs('logo/embedLogo', exist_ok=True)
os.makedirs('logo/unembedLogo', exist_ok=True)

for picpath in embed:
    img = _load_background(picpath)
    if img is None:
        continue
    marge2img(img, logo, 2)
    cv2.imwrite(os.path.join('logo/embedLogo', os.path.basename(picpath)), img)

for picpath in unembed:
    img = _load_background(picpath)
    if img is None:
        continue
    cv2.imwrite(os.path.join('logo/unembedLogo', os.path.basename(picpath)), img)

# Show up to four randomly chosen logo-embedded images in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
aslist = axes.ravel()

demos = glob.glob('logo/embedLogo/*.jpg')
random.shuffle(demos)
for ax, path in zip(aslist, demos[:4]):
    ax.imshow(plt.imread(path))

image

数据增强

嵌入的logo都很“正”,但是用户拍摄的时候可能会有各种角度,我们可以使用如镜像,缩放,随机裁剪,随机变换等方式生成不同角度的图片,丰富训练集。


def rotate(img, degree):
    """Rotate an image about its centre without changing its size.

    Args:
        img: Image array whose first two axes are (rows, cols).
        degree: Rotation angle in degrees, passed to
            cv2.getRotationMatrix2D with a scale factor of 1.

    Returns:
        The rotated image, with the same width and height as the input.
    """
    num_rows, num_cols = img.shape[:2]
    centre = (num_cols / 2, num_rows / 2)  # OpenCV points are (x, y)
    rotation_matrix = cv2.getRotationMatrix2D(centre, degree, 1)
    return cv2.warpAffine(img, rotation_matrix, (num_cols, num_rows))

def affinetransformation(img, reverse=False):
    """Shear an image with a fixed three-point affine transform.

    With ``reverse=False`` the top-left corner stays put, the top-right
    corner moves to 80% of the width and the bottom-left corner moves right
    to 20% of the width. With ``reverse=True`` the top-right corner stays
    put, the top-left corner moves to 20% of the width and the bottom-right
    corner moves to 80% of the width.

    Args:
        img: Image array whose first two axes are (rows, cols).
        reverse: Selects which of the two point mappings is used.

    Returns:
        The transformed image, with the same width and height as the input.
    """
    rows, cols = img.shape[:2]
    right = cols - 1
    bottom = rows - 1
    near = int(0.2 * right)
    far = int(0.8 * right)

    # Points are (x, y); three point pairs fully determine the affine map.
    if reverse:
        src_points = np.float32([[0, 0], [right, 0], [right, bottom]])
        dst_points = np.float32([[near, 0], [right, 0], [far, bottom]])
    else:
        src_points = np.float32([[0, 0], [right, 0], [0, bottom]])
        dst_points = np.float32([[0, 0], [far, 0], [near, bottom]])

    affine_matrix = cv2.getAffineTransform(src_points, dst_points)
    return cv2.warpAffine(img, affine_matrix, (cols, rows))

# Preview the augmentations side by side: two random rotations, both affine
# shears and a horizontal flip of the sample image.
fig, axes = plt.subplots(1, 5, figsize=(8*5, 8))
aslist = axes.ravel()

augmented = [
    rotate(demo, random.randint(-30, 30)),
    rotate(demo, random.randint(-30, 30)),
    affinetransformation(demo, True),
    affinetransformation(demo),
    cv2.flip(demo, 1),
]
for ax, variant in zip(aslist, augmented):
    # demo was loaded with OpenCV (BGR); matplotlib expects RGB.
    ax.imshow(cv2.cvtColor(variant, cv2.COLOR_BGR2RGB))

image

特征提取

如果是深度学习,直接将图片的原始数据输入到神经网络就可以了,特征提取的工作可以交给神经网络来做。但是如果是用传统的机器学习算法,就需要我们“制造”一些特征来帮助模型更好的工作。

首先图片的原始特征肯定是必须要的,将原始特征转换为一维的:


def bin_spatial(img, size=(32, 32)):
    """Return the down-sampled raw pixel values of an image as a 1-D vector.

    Each of the first three channels is resized to ``size`` independently
    and flattened; the result is channel 0, then channel 1, then channel 2.

    Args:
        img: Image array of shape (rows, cols, channels) with >= 3 channels.
        size: Target (width, height) passed to cv2.resize.

    Returns:
        1-D array of length ``3 * size[0] * size[1]``.
    """
    channels = [cv2.resize(img[:, :, c], size).ravel() for c in range(3)]
    return np.hstack(channels)

# Sanity check: a 32x32 thumbnail with 3 channels gives 3072 values.
spatial_features = bin_spatial(demo, (32, 32))
print(spatial_features.shape)

(3072,)

我们可以对图片的每个色值进行统计,也许会有用呢?


def color_hist(img, nbins=32, bins_range=(0, 256)):
    """Return the concatenated per-channel value histograms of an image.

    The histogram range is fixed rather than derived from each image's own
    minimum and maximum, so bin ``k`` covers the same values for every image
    and the features are comparable across images.

    Args:
        img: Image array of shape (rows, cols, channels) with >= 3 channels.
        nbins: Number of histogram bins per channel.
        bins_range: (lower, upper) value range covered by the bins. The
            default suits 8-bit images; pass a different range for other
            value scales.

    Returns:
        1-D array of length ``3 * nbins`` holding the bin counts of
        channel 0, channel 1 and channel 2, in that order.
    """
    per_channel = [
        np.histogram(img[:, :, c], bins=nbins, range=bins_range)[0]
        for c in range(3)
    ]
    return np.concatenate(per_channel)

取一张有logo的看看统计结果:


# Plot the three per-channel colour histograms of one image that contains the logo.
haslogo = plt.imread(glob.glob('logo/embedLogo/*.jpg')[0])

hist_features = color_hist(haslogo, nbins=32)

fig, axes = plt.subplots(1, 3, figsize=(16,8))
aslist = axes.ravel()

# x positions of the 32 bars, spread over 0..255.
bin_starts = [k * (255 / 32) for k in range(32)]
for i in range(3):
    # hist_features holds 32 counts per channel, channel after channel.
    aslist[i].bar(bin_starts, hist_features[i*32:(i+1)*32])

image

再取一张没有logo的看看统计结果:


# Plot the three per-channel colour histograms of one image without the logo.
nologo = plt.imread(glob.glob('logo/unembedLogo/*.jpg')[0])

hist_features = color_hist(nologo, nbins=32)

fig, axes = plt.subplots(1, 3, figsize=(16,8))
aslist = axes.ravel()

# x positions of the 32 bars, spread over 0..255.
bin_starts = [k * (255 / 32) for k in range(32)]
for i in range(3):
    # hist_features holds 32 counts per channel, channel after channel.
    aslist[i].bar(bin_starts, hist_features[i*32:(i+1)*32])

image

额,看不出什么差别。。。不过直觉告诉我们应该是有差别的,因为logo的绿色比较多。希望模型可以分辨其中的不同。

下面我们来提取方向梯度直方图,这是目标检测常用的特征:


orient = 15  # HOG orientations
pix_per_cell = 8  # HOG pixels per cell
cell_per_block = 2  # HOG cells per block

fig, axes = plt.subplots(1, 3, figsize=(16,8))
aslist = axes.ravel()

# Render the HOG visualisation of each colour channel of the logo image.
for i in range(3):
    # With visualize=True, hog returns (features, image); only the image is shown.
    _, pic = hog(haslogo[:, :, i], orientations=orient,
                 pixels_per_cell=(pix_per_cell, pix_per_cell),
                 cells_per_block=(cell_per_block, cell_per_block),
                 block_norm='L1', transform_sqrt=False,
                 visualize=True, feature_vector=False)
    aslist[i].imshow(pic)

image

有logo模糊的影子。。。看到没有。


def hog_image(feature_image, orient, pix_per_cell, cell_per_block):
    """Compute HOG features for the first three channels of an image.

    Args:
        feature_image: Image array of shape (rows, cols, channels) with
            >= 3 channels.
        orient: Number of HOG orientation bins.
        pix_per_cell: Cell size in pixels (cells are square).
        cell_per_block: Block size in cells (blocks are square).

    Returns:
        1-D array holding the HOG descriptors of all three channels.
    """
    per_channel = [
        hog(feature_image[:, :, channel], orientations=orient,
            pixels_per_cell=(pix_per_cell, pix_per_cell),
            cells_per_block=(cell_per_block, cell_per_block),
            block_norm='L1', transform_sqrt=False,
            visualize=False, feature_vector=False)
        for channel in range(3)
    ]
    # np.hstack joins the un-flattened per-channel arrays along their second
    # axis before flattening. Keep this order: a scaler or model fitted on
    # these features depends on the exact layout.
    return np.hstack(per_channel).ravel()

定义一个函数,输入图片,输出图片的特征:


def extract_feature(image, color_space='RGB', spatial_size=(32, 32),
                    hist_bins=32, orient=9,
                    pix_per_cell=8, cell_per_block=2):
    """Build the full feature vector for a single RGB image.

    The vector is the concatenation of the down-sampled raw pixels
    (bin_spatial), the per-channel colour histograms (color_hist) and the
    HOG descriptors (hog_image), in that order.

    Args:
        image: RGB image array of shape (rows, cols, 3).
        color_space: Colour space used for feature extraction; one of
            'RGB', 'HSV', 'LUV', 'HLS', 'YUV', 'YCrCb'.
        spatial_size: Thumbnail size for the raw-pixel features.
        hist_bins: Number of histogram bins per channel.
        orient: Number of HOG orientation bins.
        pix_per_cell: HOG cell size in pixels.
        cell_per_block: HOG block size in cells.

    Returns:
        1-D feature array.

    Raises:
        ValueError: If ``color_space`` is not one of the supported names.
    """
    conversions = {
        'HSV': cv2.COLOR_RGB2HSV,
        'LUV': cv2.COLOR_RGB2LUV,
        'HLS': cv2.COLOR_RGB2HLS,
        'YUV': cv2.COLOR_RGB2YUV,
        'YCrCb': cv2.COLOR_RGB2YCrCb,
    }
    if color_space == 'RGB':
        feature_image = np.copy(image)
    elif color_space in conversions:
        feature_image = cv2.cvtColor(image, conversions[color_space])
    else:
        # Fail here with a clear message instead of hitting an unbound
        # variable further down.
        raise ValueError(f'unsupported color_space: {color_space!r}')

    # Raw (down-sampled) pixel values.
    spatial_features = bin_spatial(feature_image, size=spatial_size)
    # Per-channel value histograms.
    hist_features = color_hist(feature_image, nbins=hist_bins)
    # Histogram of oriented gradients.
    hog_features = hog_image(feature_image, orient, pix_per_cell, cell_per_block)
    return np.concatenate((spatial_features, hist_features, hog_features))

对所有图片提取特征

def extract_features(imgs, color_space='RGB', spatial_size=(32, 32),
                     hist_bins=32, orient=9,
                     pix_per_cell=8, cell_per_block=2):
    """Extract feature vectors for a list of image files, with augmentation.

    For every file two vectors are appended, in this order: one for the
    image as loaded and one for a copy rotated by a random angle in
    [-30, 30] degrees. The result therefore has ``2 * len(imgs)`` entries.
    The author also tried a second rotation, both affine shears and a
    horizontal flip as augmentations; those are currently disabled.

    Args:
        imgs: Iterable of image file paths readable by plt.imread.
        color_space, spatial_size, hist_bins, orient, pix_per_cell,
        cell_per_block: Forwarded unchanged to extract_feature.

    Returns:
        List of 1-D feature arrays.
    """
    params = dict(color_space=color_space,
                  spatial_size=spatial_size, hist_bins=hist_bins,
                  orient=orient, pix_per_cell=pix_per_cell,
                  cell_per_block=cell_per_block)
    features = []
    for file in imgs:
        # Drop an alpha channel if the file has one.
        image = plt.imread(file)[:, :, :3]
        features.append(extract_feature(image, **params))
        rotated = rotate(image, random.randint(-30, 30))
        features.append(extract_feature(rotated, **params))
    return features

color_space = 'YUV'  # one of RGB, HSV, LUV, HLS, YUV, YCrCb
orient = 15  # HOG orientations
pix_per_cell = 8  # HOG pixels per cell
cell_per_block = 2  # HOG cells per block
spatial_size = (32, 32)  # thumbnail size for the raw-pixel features
hist_bins = 32    # number of histogram bins per channel

logos = glob.glob('logo/embedLogo/*.jpg')
nologos = glob.glob('logo/unembedLogo/*.jpg')

# Both classes must be extracted with exactly the same settings.
feature_params = dict(color_space=color_space,
                      spatial_size=spatial_size, hist_bins=hist_bins,
                      orient=orient, pix_per_cell=pix_per_cell,
                      cell_per_block=cell_per_block)

logo_features = extract_features(logos, **feature_params)

nologo_features = extract_features(nologos, **feature_params)

训练模型

# Stack both classes into one feature matrix with matching labels
# (1 = has logo, 0 = no logo).
X = np.vstack((logo_features, nologo_features))
y = np.hstack((np.ones(len(logo_features)), np.zeros(len(nologo_features))))

# Split into training and test sets BEFORE scaling, so the scaler never sees
# test-set statistics.
# NOTE(review): each image and its rotated copy are separate rows, so the two
# can still land on opposite sides of the split; a per-image split would give
# a stricter accuracy estimate.
rand_state = np.random.randint(0, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_state)

# Standardise using the training-set mean and variance only.
X_scaler = StandardScaler().fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

print('特征向量长度:', len(X_train[0]))
print('训练样本数量:',len(X_train))
print('测试样本数量:',len(X_test))

# Train the linear support vector classifier and time it.
svc = LinearSVC()
t=time.time()
svc.fit(X_train, y_train)
t2 = time.time()
print('训练SVC耗时:',round(t2-t, 2), '秒')
print('Test Accuracy of SVC:', round(svc.score(X_test, y_test), 4))
特征向量长度: 24948
训练样本数量: 1969
测试样本数量: 493
训练SVC耗时: 9.61 秒
Test Accuracy of SVC: 0.9777

保存模型


# Persist the classifier together with its scaler: new inputs must be scaled
# with the same scaler before prediction.
obj = {"svc":svc,"X_scaler":X_scaler}
# The context manager flushes and closes the file even if pickling fails.
with open("model.pickle", "wb") as model_file:
    pickle.dump(obj, model_file)

手机拍照测试

用自己手机对着电脑屏幕拍摄了几张logo,试试效果。


# Phone photos that DO contain the logo.
tests = glob.glob('logo/myphoto/haslogo/*.jpeg')

fig, axes = plt.subplots(2, 4, figsize=(8*4, 8))
aslist = axes.ravel()
test_features = []
# zip stops at the 8 available axes, so extra photos cannot overflow the grid.
for ax, path in zip(aslist, tests):
    test = plt.imread(path)
    # Same preprocessing as the training images: 100x100, three channels.
    test = cv2.resize(test, (100, 100))[:, :, :3]
    ax.imshow(test)
    test_feature = extract_feature(test, color_space=color_space,
                                   spatial_size=spatial_size, hist_bins=hist_bins,
                                   orient=orient, pix_per_cell=pix_per_cell,
                                   cell_per_block=cell_per_block)
    test_features.append(test_feature)

image


# Scale the photo features with the training scaler, then classify them
# (1 = logo detected, 0 = no logo).
scaled_test = X_scaler.transform(np.asarray(test_features))

svc.predict(scaled_test)

array([1., 1., 1., 1., 1., 1., 1., 0.])

在公寓随手拍了几张室内景,还有我炒的菜。。。


# Phone photos that do NOT contain the logo.
tests = glob.glob('logo/myphoto/nologo/*.jpeg')

fig, axes = plt.subplots(2, 4, figsize=(8*4, 8))
aslist = axes.ravel()
test_features = []
# Only the first 8 photos are used, matching the 2x4 grid.
for ax, path in zip(aslist, tests[:8]):
    test = plt.imread(path)
    # Same preprocessing as the training images: 100x100, three channels.
    test = cv2.resize(test, (100, 100))[:, :, :3]
    ax.imshow(test)
    test_feature = extract_feature(test, color_space=color_space,
                                   spatial_size=spatial_size, hist_bins=hist_bins,
                                   orient=orient, pix_per_cell=pix_per_cell,
                                   cell_per_block=cell_per_block)
    test_features.append(test_feature)

image


# Scale the photo features with the training scaler, then classify them
# (1 = logo detected, 0 = no logo).
scaled_test = X_scaler.transform(np.asarray(test_features))

svc.predict(scaled_test)

array([0., 0., 0., 0., 0., 0., 0., 0.])

效果基本已经达到了,提升的空间肯定是有的,比如说背景图片只是我在google图片上随便找的,覆盖面不够广,图像增强我们只使用了一个旋转变换,参数还可以优化等等。


# Marks the end of the notebook run.
print("end")
posted @ 2019/02/28 23:30:48