  #!/usr/bin/env python
  '''
  You can download the converted pb model from https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
  or convert the model yourself.
  
  Follow these steps if you want to convert the original model yourself:
      To get the original .meta pre-trained model, download https://drive.google.com/file/d/1BFVXgeln-bek8TCbRjN6utPAgRE0LJZg/view
      To convert the .meta model to .pb correctly, download the original repository https://github.com/Engineering-Course/LIP_JPPNet
      Then modify the script evaluate_parsing_JPPNet-s2.py as follows:
      1. Remove the preprocessing that creates image_batch_origin:
          with tf.name_scope("create_inputs"):
          ...
      and add instead:
          image_batch_origin = tf.placeholder(tf.float32, shape=(2, None, None, 3), name='input')
  
      2. Create input
          image = cv2.imread('path/to/image')
          image_rev = np.flip(image, axis=1)
          input = np.stack([image, image_rev], axis=0)
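         (a batch of two matches the placeholder from step 1: the original image plus its horizontal flip)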
  
      3. Hardcode image_h and image_w shapes to determine output shapes.
         We use the default INPUT_SIZE = (384, 384) from evaluate_parsing_JPPNet-s2.py.
          parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, INPUT_SIZE),
                                                  tf.image.resize_images(parsing_out1_075, INPUT_SIZE),
                                                  tf.image.resize_images(parsing_out1_125, INPUT_SIZE)]), axis=0)
         Do the same for parsing_out2 and parsing_out3.
      4. Remove the postprocessing. The last network operation is:
          raw_output = tf.reduce_mean(tf.stack([parsing_out1, parsing_out2, parsing_out3]), axis=0)
         Change the evaluation call to:
          parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
  
      5. To save the model, add after sess.run(...):
          input_graph_def = tf.get_default_graph().as_graph_def()
          output_node = "Mean_3"
          output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, [output_node])
  
          output_graph = "LIP_JPPNet.pb"
          with tf.gfile.GFile(output_graph, "wb") as f:
              f.write(output_graph_def.SerializeToString())
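
      6. Optionally, sanity-check the frozen graph: it should load with OpenCV's dnn module,
         e.g. net = cv2.dnn.readNet("LIP_JPPNet.pb"), the same call this sample uses below.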
  '''
  
  import argparse
  import os.path
  import numpy as np
  import cv2 as cv
  
  
  backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
              cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
  targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
             cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
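  # cv.dnn backend/target constants exposed as the --backend/--target command-line choices below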
  
  
  def preprocess(image):
      """
      Create a 4-dimensional blob from the image and its horizontal flip
      :param image: input image
      :return: 4D blob of shape (2, 3, height, width)
      """
      image_rev = np.flip(image, axis=1)
      blob = cv.dnn.blobFromImages([image, image_rev], mean=(104.00698793, 116.66876762, 122.67891434))
      return blob
  
  
  def run_net(blob, model_path, backend, target):
      """
      Read the network and run inference
      :param blob: preprocessed input blob
      :param model_path: path to JPPNet model
      :param backend: computation backend
      :param target: computation device
      :return: raw network output
      """
      net = cv.dnn.readNet(model_path)
      net.setPreferableBackend(backend)
      net.setPreferableTarget(target)
      net.setInput(blob)
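      # forward() returns one prediction per batch image: index 0 for the original frame,
      # index 1 for its flipped copy (split apart in postprocess)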
      out = net.forward()
      return out
  
  
  def postprocess(out, input_shape):
      """
      Create a grayscale human segmentation
      :param out: network output
      :param input_shape: input image width and height
      :return: grayscale segmentation map of shape (height, width, 1) holding per-pixel LIP class ids
      """
      # LIP classes
      # 0 Background
      # 1 Hat
      # 2 Hair
      # 3 Glove
      # 4 Sunglasses
      # 5 UpperClothes
      # 6 Dress
      # 7 Coat
      # 8 Socks
      # 9 Pants
      # 10 Jumpsuits
      # 11 Scarf
      # 12 Skirt
      # 13 Face
      # 14 LeftArm
      # 15 RightArm
      # 16 LeftLeg
      # 17 RightLeg
      # 18 LeftShoe
      # 19 RightShoe
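      # out[0] (head) is the prediction for the original image, out[1] (tail) for the flipped copy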
      head_output, tail_output = np.split(out, indices_or_sections=[1], axis=0)
      head_output = head_output.squeeze(0)
      tail_output = tail_output.squeeze(0)
  
      head_output = np.stack([cv.resize(img, dsize=input_shape) for img in head_output])
      tail_output = np.stack([cv.resize(img, dsize=input_shape) for img in tail_output])
  
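      # Before fusing, swap the flipped prediction's left/right paired classes:
      # arm (14, 15), leg (16, 17) and shoe (18, 19) channels trade places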
      tail_list = np.split(tail_output, indices_or_sections=list(range(1, 20)), axis=0)
      tail_list = [arr.squeeze(0) for arr in tail_list]
      tail_list_rev = [tail_list[i] for i in range(14)]
      tail_list_rev.extend([tail_list[15], tail_list[14], tail_list[17], tail_list[16], tail_list[19], tail_list[18]])
      tail_output_rev = np.stack(tail_list_rev, axis=0)
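      # Undo the horizontal flip (axis 2 is width) so the prediction aligns with the original image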
      tail_output_rev = np.flip(tail_output_rev, axis=2)
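      # Average both predictions, then take the per-pixel argmax over classes to get label ids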
      raw_output_all = np.mean(np.stack([head_output, tail_output_rev], axis=0), axis=0, keepdims=True)
      raw_output_all = np.argmax(raw_output_all, axis=1)
      raw_output_all = raw_output_all.transpose(1, 2, 0)
      return raw_output_all
  
  
  def decode_labels(gray_image):
      """
      Colorize image according to labels
      :param gray_image: grayscale human segmentation result
      :return: color-coded segmentation image
      """
      height, width, _ = gray_image.shape
      colors = [(0, 0, 0), (128, 0, 0), (255, 0, 0), (0, 85, 0), (170, 0, 51), (255, 85, 0),
                (0, 0, 85), (0, 119, 221), (85, 85, 0), (0, 85, 85), (85, 51, 0), (52, 86, 128),
                (0, 128, 0), (0, 0, 255), (51, 170, 221), (0, 255, 255), (85, 255, 170),
                (170, 255, 85), (255, 255, 0), (255, 170, 0)]
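      # One palette color per LIP class id (0-19, same order as the class list in postprocess)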
  
      segm = np.stack([colors[idx] for idx in gray_image.flatten()])
      segm = segm.reshape(height, width, 3).astype(np.uint8)
      segm = cv.cvtColor(segm, cv.COLOR_BGR2RGB)
      return segm
  
  
  def parse_human(image, model_path, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
      """
      Prepare the input, run the network and postprocess its output to parse a human figure.
      :param image: input image
      :param model_path: path to JPPNet model
      :param backend: computation backend
      :param target: computation target device
      :return: color-coded segmentation image
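
      Example (hypothetical image path; the model must be downloaded separately):
          >>> img = cv.imread('person.jpg')
          >>> segm = parse_human(img, 'lip_jppnet_384.pb')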
      """
      blob = preprocess(image)
      input_h, input_w = blob.shape[2:]
      output = run_net(blob, model_path, backend, target)
      grayscale_out = postprocess(output, (input_w, input_h))
      segmentation = decode_labels(grayscale_out)
      return segmentation
  
  
  if __name__ == '__main__':
      parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
                                       formatter_class=argparse.ArgumentDefaultsHelpFormatter)
      parser.add_argument('--input', '-i', required=True, help='Path to input image.')
      parser.add_argument('--model', '-m', default='lip_jppnet_384.pb', help='Path to pb model.')
      parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                          help="Choose one of computation backends: "
                               "%d: automatically (by default), "
                               "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                               "%d: OpenCV implementation, "
                               "%d: VKCOM, "
                               "%d: CUDA"% backends)
      parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                          help='Choose one of target computation devices: '
                               '%d: CPU target (by default), '
                               '%d: OpenCL, '
                               '%d: OpenCL fp16 (half-float precision), '
                               '%d: NCS2 VPU, '
                               '%d: HDDL VPU, '
                               '%d: Vulkan, '
                               '%d: CUDA, '
                               '%d: CUDA fp16 (half-float precision)' % targets)
      args, _ = parser.parse_known_args()
  
      if not os.path.isfile(args.model):
          raise OSError("Model not exist")
  
      image = cv.imread(args.input)
      if image is None:
          raise OSError("Image could not be read: " + args.input)
      output = parse_human(image, args.model, args.backend, args.target)
      winName = 'Deep learning human parsing in OpenCV'
      cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
      cv.imshow(winName, output)
      cv.waitKey()
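
  # Example invocation (assuming lip_jppnet_384.pb has been downloaded next to the script):
  #   python human_parsing.py --input person.jpg --model lip_jppnet_384.pb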