import numpy as np
import pandas as pd
import cv2, os, glob
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (
Add, Concatenate, Conv2D,
Input, Lambda, LeakyReLU,
MaxPool2D, UpSampling2D, ZeroPadding2D
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import (
binary_crossentropy,
sparse_categorical_crossentropy
)
YOLOV3_LAYER_LIST = [
'yolo_darknet',
'yolo_conv_0',
'yolo_output_0',
'yolo_conv_1',
'yolo_output_1',
'yolo_conv_2',
'yolo_output_2',
]
yolo_anchors = np.array([
(10, 13), (16, 30), (33, 23), (30, 61), (62, 45),
(59, 119), (116, 90), (156, 198), (373, 326)],
np.float32) / 416
yolo_anchor_masks = np.array([[6, 7, 8], [3, 4, 5], [0, 1, 2]])
class_names = [
'person', 'bicycle','car','motorbike','aeroplane','bus','train','truck','boat',
'traffic light','fire hydrant','stop sign','parking meter','bench',
'bird','cat','dog','horse','sheep','cow','elephant','bear','zebra',
'giraffe','backpack','umbrella','handbag','tie','suitcase','frisbee',
'skis','snowboard','sports ball','kite','baseball bat','baseball glove',
'skateboard','surfboard','tennis racket','bottle','wine glass','cup',
'fork','knife','spoon','bowl','banana','apple','sandwich','orange',
'broccoli','carrot','hot dog','pizza','donut','cake','chair','sofa',
'pottedplant','bed','diningtable','toilet','tvmonitor','laptop','mouse',
'remote','keyboard','cell phone','microwave','oven','toaster','sink',
'refrigerator','book','clock','vase','scissors','teddy bear',
'hair drier','toothbrush'
]
def load_darknet_weights(model, weights_file):
wf = open(weights_file, 'rb')
major, minor, revision, seen, _ = np.fromfile(wf, dtype=np.int32, count=5)
layers = YOLOV3_LAYER_LIST # List of layers as defined in your code
for layer_name in layers:
sub_model = model.get_layer(layer_name)
for i, layer in enumerate(sub_model.layers):
if not layer.name.startswith('conv2d'):
continue
batch_norm = None
if i + 1 < len(sub_model.layers) and sub_model.layers[i + 1].name.startswith('batch_norm'):
batch_norm = sub_model.layers[i + 1]
filters = layer.filters
size = layer.kernel_size[0]
in_dim = layer.input.shape[-1] # Use layer.input.shape for input dimension
if batch_norm is None:
conv_bias = np.fromfile(wf, dtype=np.float32, count=filters)
else:
bn_weights = np.fromfile(wf, dtype=np.float32, count=4 * filters)
bn_weights = bn_weights.reshape((4, filters))[[1, 0, 2, 3]]
conv_shape = (filters, in_dim, size, size)
conv_weights = np.fromfile(wf, dtype=np.float32, count=np.product(conv_shape))
conv_weights = conv_weights.reshape(conv_shape).transpose([2, 3, 1, 0])
if batch_norm is None:
layer.set_weights([conv_weights, conv_bias])
else:
layer.set_weights([conv_weights])
batch_norm.set_weights(bn_weights)
assert len(wf.read()) == 0, 'failed to read all data'
wf.close()
def broadcast_iou(box_1, box_2):
# broadcast boxes
box_1 = tf.expand_dims(box_1, -2)
box_2 = tf.expand_dims(box_2, 0)
# new_shape: (..., N, (x1, y1, x2, y2))
new_shape = tf.broadcast_dynamic_shape(tf.shape(box_1), tf.shape(box_2))
box_1 = tf.broadcast_to(box_1, new_shape)
box_2 = tf.broadcast_to(box_2, new_shape)
int_w = tf.maximum(tf.minimum(box_1[..., 2], box_2[..., 2]) - tf.maximum(box_1[..., 0], box_2[..., 0]), 0)
int_h = tf.maximum(tf.minimum(box_1[..., 3], box_2[..., 3]) - tf.maximum(box_1[..., 1], box_2[..., 1]), 0)
int_area = int_w * int_h
box_1_area = (box_1[..., 2] - box_1[..., 0]) * (box_1[..., 3] - box_1[..., 1])
box_2_area = (box_2[..., 2] - box_2[..., 0]) * (box_2[..., 3] - box_2[..., 1])
return int_area / (box_1_area + box_2_area - int_area)
def freeze_all(model, frozen = True):
model.trainable = not frozen
if isinstance(model, tf.keras.Model):
for l in model.layers:
freeze_all(l, frozen)
def draw_outputs(img, outputs, class_names):
boxes, objectness, classes, nums = outputs
boxes, objectness, classes, nums = boxes[0], objectness[0], classes[0], nums[0]
wh = np.flip(img.shape[0:2])
for i in range(nums):
x1y1 = tuple((np.array(boxes[i][0:2]) * wh).astype(np.int32))
x2y2 = tuple((np.array(boxes[i][2:4]) * wh).astype(np.int32))
img = cv2.rectangle(img, x1y1, x2y2, (255, 0, 0), 2)
img = cv2.putText(img, '{} {:.4f}'.format(
class_names[int(classes[i])], objectness[i]),
x1y1, cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
return img
def transform_images(x_train, size):
x_train = tf.image.resize(x_train, (size, size))
x_train = x_train / 255
return x_train
@tf.function
def transform_targets_for_output(y_true, grid_size, anchor_idxs, classes):
N = tf.shape(y_true)[0]
y_true_out = tf.zeros(
(N, grid_size, grid_size, tf.shape(anchor_idxs)[0], 6))
anchor_idxs = tf.cast(anchor_idxs, tf.int32)
indexes = tf.TensorArray(tf.int32, 1, dynamic_size=True)
updates = tf.TensorArray(tf.float32, 1, dynamic_size=True)
idx = 0
for i in tf.range(N):
for j in tf.range(tf.shape(y_true)[1]):
if tf.equal(y_true[i][j][2], 0):
continue
anchor_eq = tf.equal(
anchor_idxs, tf.cast(y_true[i][j][5], tf.int32))
if tf.reduce_any(anchor_eq):
box = y_true[i][j][0:4]
box_xy = (y_true[i][j][0:2] + y_true[i][j][2:4]) / 2
anchor_idx = tf.cast(tf.where(anchor_eq), tf.int32)
grid_xy = tf.cast(box_xy // (1/grid_size), tf.int32)
indexes = indexes.write(
idx, [i, grid_xy[1], grid_xy[0], anchor_idx[0][0]])
updates = updates.write(
idx, [box[0], box[1], box[2], box[3], 1, y_true[i][j][4]])
idx += 1
return tf.tensor_scatter_nd_update(
y_true_out, indexes.stack(), updates.stack())
def transform_targets(y_train, anchors, anchor_masks, classes):
y_outs = []
grid_size = 13
anchors = tf.cast(anchors, tf.float32)
anchor_area = anchors[..., 0] * anchors[..., 1]
box_wh = y_train[..., 2:4] - y_train[..., 0:2]
box_wh = tf.tile(tf.expand_dims(box_wh, -2), (1, 1, tf.shape(anchors)[0], 1))
box_area = box_wh[..., 0] * box_wh[..., 1]
intersection = tf.minimum(box_wh[..., 0], anchors[..., 0]) * tf.minimum(box_wh[..., 1], anchors[..., 1])
iou = intersection / (box_area + anchor_area - intersection)
anchor_idx = tf.cast(tf.argmax(iou, axis=-1), tf.float32)
anchor_idx = tf.expand_dims(anchor_idx, axis=-1)
y_train = tf.concat([y_train, anchor_idx], axis=-1)
for anchor_idxs in anchor_masks:
y_outs.append(transform_targets_for_output(
y_train, grid_size, anchor_idxs, classes))
grid_size *= 2
return tuple(y_outs)
class BatchNormalization(tf.keras.layers.BatchNormalization):
@tf.function
def call(self, x, training=False):
if training is None:
training = tf.constant(False)
training = tf.logical_and(training, self.trainable)
return super(BatchNormalization, self).call(x, training=training)
def DarknetConv(x, filters, size, strides=1, batch_norm=True):
if strides == 1:
padding = 'same'
else:
x = ZeroPadding2D(((1, 0), (1, 0)))(x) # top left half-padding
padding = 'valid'
x = Conv2D(filters=filters, kernel_size=size,
strides=strides, padding=padding,
use_bias=not batch_norm, kernel_regularizer=l2(0.0005))(x)
if batch_norm:
x = BatchNormalization()(x)
x = LeakyReLU(alpha=0.1)(x)
return x
def DarknetResidual(x, filters):
prev = x
x = DarknetConv(x, filters // 2, 1)
x = DarknetConv(x, filters, 3)
x = Add()([prev, x]) # Ensure addition is valid
return x # Return the tensor
def DarknetBlock(x, filters, blocks):
x = DarknetConv(x, filters, 3, strides=2)
for _ in range(blocks):
x = DarknetResidual(x, filters) # Ensure residual connection
return x # Return the tensor
def Darknet(name=None):
x = inputs = Input([None, None, 3])
x = DarknetConv(x, 32, 3)
x = DarknetBlock(x, 64, 1)
x = DarknetBlock(x, 128, 2) # skip connection
x = x_36 = DarknetBlock(x, 256, 8) # skip connection
x = x_61 = DarknetBlock(x, 512, 8)
x = DarknetBlock(x, 1024, 4)
return tf.keras.Model(inputs, (x_36, x_61, x), name=name)
def YoloConv(x_in, filters, name=None):
if isinstance(x_in, tuple):
inputs = Input(x_in[0].shape[1:]), Input(x_in[1].shape[1:])
x, x_skip = inputs
# concat with skip connection
x = DarknetConv(x, filters, 1)
x = UpSampling2D(2)(x)
x = Concatenate()([x, x_skip])
else:
x = inputs = Input(x_in.shape[1:])
x = DarknetConv(x, filters, 1)
x = DarknetConv(x, filters * 2, 3)
x = DarknetConv(x, filters, 1)
x = DarknetConv(x, filters * 2, 3)
x = DarknetConv(x, filters, 1)
return Model(inputs, x, name=name)(x_in)
def YoloOutput(x_in, filters, anchors, classes, name=None):
x = inputs = Input(x_in.shape[1:])
x = DarknetConv(x, filters * 2, 3)
x = DarknetConv(x, anchors * (classes + 5), 1, batch_norm=False)
x = Lambda(lambda x: tf.reshape(x, (-1, tf.shape(x)[1], tf.shape(x)[2], anchors, classes + 5)))(x)
return tf.keras.Model(inputs, x, name=name)(x_in)
def yolo_boxes(pred, anchors, classes):
'''pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...classes))'''
grid_size = tf.shape(pred)[1]
box_xy, box_wh, objectness, class_probs = tf.split(
pred, (2, 2, 1, classes), axis=-1)
box_xy = tf.sigmoid(box_xy)
objectness = tf.sigmoid(objectness)
class_probs = tf.sigmoid(class_probs)
pred_box = tf.concat((box_xy, box_wh), axis=-1) # original xywh for loss
grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size))
grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2) # [gx, gy, 1, 2]
box_xy = (box_xy + tf.cast(grid, tf.float32)) / \
tf.cast(grid_size, tf.float32)
box_wh = tf.exp(box_wh) * anchors
box_x1y1 = box_xy - box_wh / 2
box_x2y2 = box_xy + box_wh / 2
bbox = tf.concat([box_x1y1, box_x2y2], axis=-1)
return bbox, objectness, class_probs, pred_box
def yolo_nms(outputs, anchors, masks, classes):
'''boxes, conf, type'''
b, c, t = [], [], []
for o in outputs:
b.append(tf.reshape(o[0], (tf.shape(o[0])[0], -1, tf.shape(o[0])[-1])))
c.append(tf.reshape(o[1], (tf.shape(o[1])[0], -1, tf.shape(o[1])[-1])))
t.append(tf.reshape(o[2], (tf.shape(o[2])[0], -1, tf.shape(o[2])[-1])))
bbox = tf.concat(b, axis=1)
confidence = tf.concat(c, axis=1)
class_probs = tf.concat(t, axis=1)
scores = confidence * class_probs
boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
boxes=tf.reshape(bbox, (tf.shape(bbox)[0], -1, 1, 4)),
scores=tf.reshape(
scores,
(tf.shape(scores)[0], -1, tf.shape(scores)[-1])
),
max_output_size_per_class=100,
max_total_size = 100,
iou_threshold = 0.5,
score_threshold = 0.5
)
return boxes, scores, classes, valid_detections
def YoloV3(size=None, channels=3, anchors=yolo_anchors, masks=yolo_anchor_masks, classes=80, training=False):
x = inputs = Input([size, size, channels])
x_36, x_61, x = Darknet(name='yolo_darknet')(x)
x = YoloConv(x, 512, name='yolo_conv_0')
output_0 = YoloOutput(x, 512, len(masks[0]), classes, name='yolo_output_0')
x = YoloConv((x, x_61), 256, name='yolo_conv_1')
output_1 = YoloOutput(x, 256, len(masks[1]), classes, name='yolo_output_1')
x = YoloConv((x, x_36), 128, name='yolo_conv_2')
output_2 = YoloOutput(x, 128, len(masks[2]), classes, name='yolo_output_2')
if training:
return Model(inputs, (output_0, output_1, output_2), name='yolov3')
boxes_0 = Lambda(lambda x: yolo_boxes(x, anchors[masks[0]], classes),
name='yolo_boxes_0')(output_0)
boxes_1 = Lambda(lambda x: yolo_boxes(x, anchors[masks[1]], classes),
name='yolo_boxes_1')(output_1)
boxes_2 = Lambda(lambda x: yolo_boxes(x, anchors[masks[2]], classes),
name='yolo_boxes_2')(output_2)
outputs = Lambda(lambda x: yolo_nms(x, anchors, masks, classes),
name='yolo_nms')((boxes_0[:3], boxes_1[:3], boxes_2[:3]))
return Model(inputs, outputs, name='yolov3')
def YoloLoss(anchors, classes=80, ignore_thresh=0.5):
def yolo_loss(y_true, y_pred):
# 1. transform all pred outputs
# y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls))
pred_box, pred_obj, pred_class, pred_xywh = yolo_boxes(y_pred, anchors, classes)
pred_xy = pred_xywh[..., 0:2]
pred_wh = pred_xywh[..., 2:4]
# 2. transform all true outputs
# y_true: (batch_size, grid, grid, anchors, (x1, y1, x2, y2, obj, cls))
true_box, true_obj, true_class_idx = tf.split(
y_true, (4, 1, 1), axis=-1)
true_xy = (true_box[..., 0:2] + true_box[..., 2:4]) / 2
true_wh = true_box[..., 2:4] - true_box[..., 0:2]
# give higher weights to small boxes
box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1]
# 3. inverting the pred box equations
grid_size = tf.shape(y_true)[1]
grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size))
grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)
true_xy = true_xy * tf.cast(grid_size, tf.float32) - \
tf.cast(grid, tf.float32)
true_wh = tf.math.log(true_wh / anchors)
true_wh = tf.where(tf.math.is_inf(true_wh), tf.zeros_like(true_wh), true_wh)
# 4. calculate all masks
obj_mask = tf.squeeze(true_obj, -1)
# ignore false positive when iou is over threshold
true_box_flat = tf.boolean_mask(true_box, tf.cast(obj_mask, tf.bool))
best_iou = tf.reduce_max(broadcast_iou(
pred_box, true_box_flat), axis=-1)
ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32)
# 5. calculate all losses
xy_loss = obj_mask * box_loss_scale * \
tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1)
wh_loss = obj_mask * box_loss_scale * \
tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1)
obj_loss = binary_crossentropy(true_obj, pred_obj)
obj_loss = obj_mask * obj_loss + \
(1 - obj_mask) * ignore_mask * obj_loss
# Could also use binary_crossentropy instead
class_loss = obj_mask * sparse_categorical_crossentropy(
true_class_idx, pred_class)
# 6. sum over (batch, gridx, gridy, anchors) => (batch, 1)
xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3))
wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3))
obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3))
class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3))
return xy_loss + wh_loss + obj_loss + class_loss
return yolo_loss
yolo = YoloV3(size=416, classes=80)
yolo.summary()
from tensorflow.keras.utils import plot_model
plot_model(
yolo, rankdir = 'TB',
to_file = 'yolo_model1.png',
show_shapes = False,
show_layer_names = True,
expand_nested = False
)
load_darknet_weights(yolo, '/content/yolov3.weights')
def predict(image_file, visualize = True, figsize = (16, 16)):
img = tf.image.decode_image(open(image_file, 'rb').read(), channels=3)
img = tf.expand_dims(img, 0)
img = transform_images(img, 416)
boxes, scores, classes, nums = yolo.predict(img)
img = cv2.cvtColor(cv2.imread(image_file), cv2.COLOR_BGR2RGB)
img = draw_outputs(img, (boxes, scores, classes, nums), class_names)
if visualize:
fig, axes = plt.subplots(figsize = figsize)
plt.imshow(img)
plt.show()
return boxes, scores, classes, nums
image_file = '/content/Dog and Cat Image.jpg'
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
# Updated prediction function with scaled bounding boxes
def predict(image_file, visualize=True, figsize=(16, 16)):
# Load and preprocess the image
img = tf.image.decode_image(open(image_file, 'rb').read(), channels=3)
img = tf.expand_dims(img, 0)
img = transform_images(img, 416)
# Get predictions
boxes, scores, classes, nums = yolo.predict(img)
# Convert the original image for plotting
img = cv2.cvtColor(cv2.imread(image_file), cv2.COLOR_BGR2RGB)
img_height, img_width, _ = img.shape
# Plotting the results with scaled boxes
if visualize:
plt.figure(figsize=figsize)
for i in range(nums[0]):
# Extract and scale box coordinates to pixel values
y1, x1, y2, x2 = boxes[0][i]
x1 = int(x1 * img_width)
y1 = int(y1 * img_height)
x2 = int(x2 * img_width)
y2 = int(y2 * img_height)
# Draw bounding box
cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
# Prepare label with class and score
class_name = class_names[int(classes[0][i])]
score = scores[0][i]
label = f"{class_name}: {score:.2f}"
# Display label above the box
cv2.putText(img, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
# Show the final image with annotations
plt.imshow(img)
plt.axis('off')
plt.show()
return boxes, scores, classes, nums
# Define the path to the image
image_file = '/content/lamb-8287365_1280.jpg'
# Run prediction and plot
predict(image_file)