Image, Video and Real-Time Webcam Object Detection & Instance Segmentation using Mask R-CNN

Introduction

Object Detection

Mask R-CNN

Mask R-CNN framework for instance segmentation. Source: https://arxiv.org/abs/1703.06870
4K Video Demo by Karol Majek.
Mask R-CNN results on the COCO test set. Masks are shown in color, and bounding box, category, and confidences are also shown
git clone https://github.com/matterport/Mask_RCNN.git
Folder Structure
Install all the dependencies required by Mask R-CNN before running the code below.
import tensorflow as tf
print(tf.__version__)
import os
import sys
import random
import math
import numpy as np
import skimage.io
import matplotlib
import matplotlib.pyplot as plt
# Root directory of the project
ROOT_DIR = os.path.abspath("../")

# Import Mask RCNN
sys.path.append(ROOT_DIR)  # To find local version of the library
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
# Import COCO config
sys.path.append(os.path.join(ROOT_DIR, "samples/coco/"))  # To find local version
import coco

# %matplotlib inline  # Jupyter magic; uncomment when running inside a notebook

# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
# Download COCO trained weights from Releases if needed
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

# Directory of images to run detection on
IMAGE_DIR = os.path.join(ROOT_DIR, "images")
class InferenceConfig(coco.CocoConfig):
    # Set batch size to 1 since we'll be running inference on
    # one image at a time. Batch size = GPU_COUNT * IMAGES_PER_GPU
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1


config = InferenceConfig()
config.display()

# Create model object in inference mode.
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
# Load weights trained on MS-COCO
model.load_weights(COCO_MODEL_PATH, by_name=True)
Configurations:
BACKBONE_SHAPES [[256 256]
[128 128]
[ 64 64]
[ 32 32]
[ 16 16]]
BACKBONE_STRIDES [4, 8, 16, 32, 64]
BATCH_SIZE 1
BBOX_STD_DEV [ 0.1 0.1 0.2 0.2]
DETECTION_MAX_INSTANCES 100
DETECTION_MIN_CONFIDENCE 0.5
DETECTION_NMS_THRESHOLD 0.3
GPU_COUNT 1
IMAGES_PER_GPU 1
IMAGE_MAX_DIM 1024
IMAGE_MIN_DIM 800
IMAGE_PADDING True
IMAGE_SHAPE [1024 1024 3]
LEARNING_MOMENTUM 0.9
LEARNING_RATE 0.002
MASK_POOL_SIZE 14
MASK_SHAPE [28, 28]
MAX_GT_INSTANCES 100
MEAN_PIXEL [ 123.7 116.8 103.9]
MINI_MASK_SHAPE (56, 56)
NAME coco
NUM_CLASSES 81
POOL_SIZE 7
POST_NMS_ROIS_INFERENCE 1000
POST_NMS_ROIS_TRAINING 2000
ROI_POSITIVE_RATIO 0.33
RPN_ANCHOR_RATIOS [0.5, 1, 2]
RPN_ANCHOR_SCALES (32, 64, 128, 256, 512)
RPN_ANCHOR_STRIDE 2
RPN_BBOX_STD_DEV [ 0.1 0.1 0.2 0.2]
RPN_TRAIN_ANCHORS_PER_IMAGE 256
STEPS_PER_EPOCH 1000
TRAIN_ROIS_PER_IMAGE 128
USE_MINI_MASK True
USE_RPN_ROIS True
VALIDATION_STEPS 50
WEIGHT_DECAY 0.0001
# COCO class names; an instance's class ID is its index in this list (0-80),
# with index 0 reserved for the background class.
class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
               'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
               'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
               'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
               'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']
# Load a specific image from the images folder.
# (The original comment said "random image" but a fixed file was loaded;
# the unused directory listing has been removed.)
image = skimage.io.imread(os.path.join(IMAGE_DIR, 'sh_expo.jpg'))
# Run detection
results = model.detect([image], verbose=1)
# Visualize results: results is a list with one dict per input image
r = results[0]
visualize.display_instances(image, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])
Processing 1 images image shape: (375, 500, 3) min: 0.00000 max: 255.00000 molded_images shape: (1, 1024, 1024, 3) min: -123.70000 max: 151.10000 image_metas shape: (1, 89) min: 0.00000 max: 1024.00000
import collections

# Count how many instances of each class were detected in the last result.
print(r['class_ids'])
class_ids = r['class_ids']
counter = collections.Counter(class_ids)
print(counter)
for key in counter:
    print("Object: {}, Quantity: {}".format(class_names[key], counter[key]))
[ 3  3 10  3  1  3 10 10 10  3  3 10 10 10  1  3  1  3  1 10  1  3  1  1
10 1 1 3 3 3]
Counter({3: 12, 10: 9, 1: 9})
Object: car, Quantity: 12
Object: traffic light, Quantity: 9
Object: person, Quantity: 9
# Custom image testing
file_names = next(os.walk(IMAGE_DIR))[2]
# scipy.misc.imread was removed in SciPy 1.2+; use skimage.io.imread
# (skimage is already imported above) instead.
image = skimage.io.imread(os.path.join(IMAGE_DIR, 'toronto.jpg'))
# Run detection
results = model.detect([image], verbose=1)
# Visualize results
r = results[0]
visualize.display_instances(image, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])
Toronto

Video Object Detection

# Download a video from YouTube
import pytube
from pytube import YouTube
import os


def downloadYouTube(videourl, path):
    """Download the highest-resolution progressive MP4 stream of a video.

    videourl: full YouTube URL of the video.
    path: destination directory; created if it does not exist.
    """
    yt = YouTube(videourl)
    # progressive streams bundle audio+video; pick the best resolution
    stream = (yt.streams.filter(progressive=True, file_extension='mp4')
              .order_by('resolution').desc().first())
    if not os.path.exists(path):
        os.makedirs(path)
    stream.download(path)


downloadYouTube('https://www.youtube.com/watch?v=itIQDcHGojk', './samples/videos')
# import libraries
import os, sys
import sys
import random
import math
import numpy as np
import scipy.misc
import coco
import utils
import model as modellib
import visualize
import matplotlib
import matplotlib.pyplot as plt
# define random colors
def random_colors(N):
    """Return a list of N RGB tuples with components in [0.0, 255.0).

    The RNG is re-seeded with a fixed seed on every call, so the same N
    always produces the same colors — this keeps each object's color
    stable across video frames.
    """
    np.random.seed(1)
    colors = [tuple(255 * np.random.rand(3)) for _ in range(N)]
    return colors
# apply mask to image
def apply_mask(image, mask, color, alpha=0.5):
    """Blend *color* into *image* wherever mask == 1.

    image: H x W x C array, modified in place and returned.
    mask: H x W array of 0/1 values.
    color: per-channel color components (one per image channel).
    alpha: opacity; blended pixel = pixel * (1 - alpha) + alpha * channel.
    """
    for n, c in enumerate(color):
        image[:, :, n] = np.where(
            mask == 1,
            # original had a typographic em-dash here ("1 — alpha"),
            # which is a syntax error; it must be a minus sign
            image[:, :, n] * (1 - alpha) + alpha * c,
            image[:, :, n]
        )
    return image
# take the image and apply the mask, box, and label
def display_instances(image, boxes, masks, ids, names, scores):
    """Draw masks, bounding boxes, and captions for every detection on *image*.

    boxes: [num_instances, (y1, x1, y2, x2)].
    masks: [H, W, num_instances] binary masks.
    ids: class ID per instance; names: list mapping class ID -> name.
    scores: per-instance confidence, or None.
    Returns the annotated image.
    """
    n_instances = boxes.shape[0]
    colors = random_colors(n_instances)

    if not n_instances:
        print('NO INSTANCES TO DISPLAY')
    else:
        assert boxes.shape[0] == masks.shape[-1] == ids.shape[0]

    for i, color in enumerate(colors):
        if not np.any(boxes[i]):
            # skip padded / empty boxes
            continue
        y1, x1, y2, x2 = boxes[i]
        label = names[ids[i]]
        score = scores[i] if scores is not None else None
        caption = '{} {:.2f}'.format(label, score) if score else label
        mask = masks[:, :, i]
        image = apply_mask(image, mask, color)
        image = cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
        image = cv2.putText(
            image, caption, (x1, y1), cv2.FONT_HERSHEY_COMPLEX, 0.7, color, 2
        )
    return image
# Mask R-CNN setup for video processing
ROOT_DIR = os.getcwd()
MODEL_DIR = os.path.join(ROOT_DIR, "logs")
VIDEO_DIR = os.path.join(ROOT_DIR, "videos")
VIDEO_SAVE_DIR = os.path.join(VIDEO_DIR, "savedimgs")
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

# Frames fed to the model per detect() call; must be assigned BEFORE the
# config class body that references it (the original defined it after,
# causing a NameError).
batch_size = 3


class InferenceConfig(coco.CocoConfig):
    GPU_COUNT = 1
    IMAGES_PER_GPU = batch_size


config = InferenceConfig()
# Config exposes display(), not print()
config.display()

# Create model object in inference mode.
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
# Load weights trained on MS-COCO
model.load_weights(COCO_MODEL_PATH, by_name=True)
# COCO class names; an instance's class ID is its index in this list (0-80).
class_names = [
    'BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
    'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
    'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
    'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
    'teddy bear', 'hair drier', 'toothbrush'
]

# Process the downloaded video frame-by-frame, running detection in
# batches of batch_size and saving each annotated frame as a JPEG.
video = cv2.VideoCapture(os.path.join(VIDEO_DIR, 'Toronto_Raptors.mp4'))

# Find OpenCV version: the FPS property moved namespaces in OpenCV 3
(major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.')
if int(major_ver) < 3:
    fps = video.get(cv2.cv.CV_CAP_PROP_FPS)
    print("Frames per second using video.get(cv2.cv.CV_CAP_PROP_FPS): {0}".format(fps))
else:
    fps = video.get(cv2.CAP_PROP_FPS)
    print("Frames per second using video.get(cv2.CAP_PROP_FPS) : {0}".format(fps))

try:
    if not os.path.exists(VIDEO_SAVE_DIR):
        os.makedirs(VIDEO_SAVE_DIR)
except OSError:
    print('Error: Creating directory of data')

frames = []
frame_count = 0
while True:
    ret, frame = video.read()
    if not ret:
        break
    # Save each frame of the video to a list
    frame_count += 1
    frames.append(frame)
    print('frame_count :{0}'.format(frame_count))
    if len(frames) == batch_size:
        results = model.detect(frames, verbose=0)
        print('Predicted')
        for i, item in enumerate(zip(frames, results)):
            frame = item[0]
            r = item[1]
            frame = display_instances(
                frame, r['rois'], r['masks'], r['class_ids'], class_names, r['scores']
            )
            # Absolute frame index of this item within the video
            # (original used a typographic em-dash, a syntax error)
            name = '{0}.jpg'.format(frame_count + i - batch_size)
            name = os.path.join(VIDEO_SAVE_DIR, name)
            cv2.imwrite(name, frame)
            print('writing to file:{0}'.format(name))
        # Clear the frames array to start the next batch
        frames = []
video.release()
# Create a video from a list of segmented images.
import glob
import os


def make_video(outvid, images=None, fps=30, size=None,
               is_color=True, format="FMP4"):
    """Write the image files listed in *images* into the video *outvid*.

    outvid: output video path.
    images: iterable of image file paths (raises FileNotFoundError on a
        missing file).
    fps: output frame rate.
    size: (width, height); inferred from the first image when None.
    is_color: color flag passed to the writer.
    format: FOURCC codec string.
    Returns the (released) VideoWriter.
    """
    from cv2 import VideoWriter, VideoWriter_fourcc, imread, resize
    fourcc = VideoWriter_fourcc(*format)
    vid = None
    for image in images:
        if not os.path.exists(image):
            raise FileNotFoundError(image)
        img = imread(image)
        if vid is None:
            if size is None:
                size = img.shape[1], img.shape[0]
            vid = VideoWriter(outvid, fourcc, float(fps), size, is_color)
        # Resize whenever EITHER dimension differs from the target; the
        # original used `and`, which skipped resizing when only one
        # dimension differed, producing frames the writer silently drops.
        if size[0] != img.shape[1] or size[1] != img.shape[0]:
            img = resize(img, size)
        vid.write(img)
    vid.release()
    return vid
# Path Configuration
ROOT_DIR = os.getcwd()
VIDEO_DIR = os.path.join(ROOT_DIR, "videos")
VIDEO_SAVE_DIR = os.path.join(VIDEO_DIR, "savedimgs")
images = list(glob.iglob(os.path.join(VIDEO_SAVE_DIR, '*.*')))
# Sort the images by integer frame index. Strip the extension with
# splitext instead of slicing off a fixed 3 characters, which left a
# trailing '.' and broke for extensions of any other length.
images = sorted(images, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
outvid = os.path.join(VIDEO_DIR, "raptor.mp4")
make_video(outvid, images, fps=30)
The whole original video can be found at Toronto NBA Finals 2019 (The Guardian) on YouTube

Real-Time Webcam Object Detection

# Check how many CPU workers are available for parallel processing.
import multiprocessing as mp
num_workers = mp.cpu_count()
print(num_workers)
# List the devices (CPU/GPU) TensorFlow can see; in a notebook the
# returned list is shown as the cell output.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()
8
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15753943118674977122, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1485760102
locality {
bus_id: 1
links {
}
}
incarnation: 6587695850660979430
physical_device_desc: "device: 0, name: GeForce GT 710, pci bus id: 0000:01:00.0, compute capability: 3.5"]
# Mask R-CNN 
import os
import sys
import cv2
import time
import imutils
import numpy as np
import mrcnn.model as modellib
from mrcnn import utils, visualize
from imutils.video import WebcamVideoStream
import random
# Root directory of the project
from samples.coco.coco import CocoConfig

# The original source fused the next three statements onto one line,
# which is invalid Python; they are split out here.
ROOT_DIR = os.path.abspath("./")
sys.path.append(os.path.join(ROOT_DIR, "samples/coco/"))  # To find local version
# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")
# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
# Download COCO trained weights from Releases if needed
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)


class InferenceConfig(CocoConfig):
    # Set batch size to 1 since we'll be running inference on
    # one image at a time. Batch size = GPU_COUNT * IMAGES_PER_GPU
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1


config = InferenceConfig()
config.display()

# Create model object in inference mode.
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
# Load weights trained on MS-COCO
model.load_weights(COCO_MODEL_PATH, by_name=True)
# Define COCO class names; an instance's class ID is its index in this list.
class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
               'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
               'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
               'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
               'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']
# Open webcam at the ID 0
cap = cv2.VideoCapture(0)
# Check whether the user-selected camera opened successfully.
# (The original source fused this comment onto the statement line above.)
if not cap.isOpened():
    print("Could not open video device")

# Real-Time Webcam Object Detection
colors = visualize.random_colors(len(class_names))
gentle_grey = (45, 65, 79)
white = (255, 255, 255)

# Feature toggles for the capture loop below
OPTIMIZE_CAM = False        # use a threaded WebcamVideoStream instead of VideoCapture
SHOW_FPS = False            # averaged FPS counter (updated every 5 s)
SHOW_FPS_WO_COUNTER = True  # per-frame FPS estimate; faster
PROCESS_IMG = True          # run Mask R-CNN on each frame

if OPTIMIZE_CAM:
    vs = WebcamVideoStream(src=0).start()
else:
    vs = cv2.VideoCapture(0)
if SHOW_FPS:
    fps_caption = "FPS: 0"
    fps_counter = 0
    start_time = time.time()

SCREEN_NAME = 'Real-Time Webcam'
cv2.namedWindow(SCREEN_NAME, cv2.WINDOW_NORMAL)
cv2.setWindowProperty(SCREEN_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
# Capture frame-by-frame, run detection, overlay an FPS badge, and display.
while True:
    if OPTIMIZE_CAM:
        frame = vs.read()
    else:
        grabbed, frame = vs.read()
        if not grabbed:
            break

    if SHOW_FPS_WO_COUNTER:
        start_time = time.time()  # start time of the loop

    if PROCESS_IMG:
        # Run detection
        results = model.detect([frame])
        r = results[0]
        # NOTE(review): display_instances_10fps is not part of stock
        # mrcnn.visualize — presumably a custom helper in a local fork;
        # confirm it exists before running.
        masked_image = visualize.display_instances_10fps(
            frame, r['rois'], r['masks'], r['class_ids'], class_names,
            r['scores'], colors=colors, real_time=True)

    if PROCESS_IMG:
        s = masked_image
    else:
        s = frame
    # print("Image shape: {1}x{0}".format(s.shape[0], s.shape[1]))

    width = s.shape[1]
    height = s.shape[0]
    top_left_corner = (width - 120, height - 20)
    bott_right_corner = (width, height)
    top_left_corner_cvtext = (width - 80, height - 5)

    if SHOW_FPS:
        fps_counter += 1
        if (time.time() - start_time) > 5:  # refresh the caption every 5 seconds
            fps_caption = "FPS: {:.0f}".format(fps_counter / (time.time() - start_time))
            # print(fps_caption)
            fps_counter = 0
            start_time = time.time()
        # Draw the caption on a filled rectangle in the bottom-right corner
        ret, baseline = cv2.getTextSize(fps_caption, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(s, (width - ret[0], height - ret[1] - baseline),
                      bott_right_corner, gentle_grey, -1)
        cv2.putText(s, fps_caption, (width - ret[0], height - baseline),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, white, lineType=cv2.LINE_AA)

    if SHOW_FPS_WO_COUNTER:
        # Display the resulting frame
        fps_caption = "FPS: {:.0f}".format(1.0 / (time.time() - start_time))
        # print("FPS: ", 1.0 / (time.time() - start_time))

        # Put the rectangle and text on the bottom right corner
        ret, baseline = cv2.getTextSize(fps_caption, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(s, (width - ret[0], height - ret[1] - baseline),
                      bott_right_corner, gentle_grey, -1)
        cv2.putText(s, fps_caption, (width - ret[0], height - baseline),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, white, 1, lineType=cv2.LINE_AA)

    # Display the frame
    s = cv2.resize(s, (1920, 1080))
    cv2.imshow(SCREEN_NAME, s)
    cv2.waitKey(1)
    # if cv2.waitKey(1) & 0xFF == ord('q'):
    #     break

# When everything is done, release the camera from video capture
if OPTIMIZE_CAM:
    vs.stop()
else:
    vs.release()
cv2.destroyAllWindows()
Real-time Webcam Object Instance Segmentation

Senior Geospatial Specialist in Toronto

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store