
Hello, thank you for open-sourcing your excellent work. While testing your model I found some flaws in the app code #17

@f-chen165

Description


1. First, the video_input.change wiring of the data-loading component: process_example modifies video_input, which causes the loading event to keep re-triggering itself. My fix is below (the idea is to neither modify video_input nor return it):
video_input.change(
    fn=process_example,
    inputs=[
        video_input,
        video_caption,
        target_region_frame1_caption,
        point_prompt,
        click_state
    ],
    outputs=[
        video_caption,
        target_region_frame1_caption,
        inference_state,
        video_state,
        video_info,
        template_frame,
        image_selection_slider,
        track_pause_number_slider,
        point_prompt,
        clear_button_click,
        tracking_video_predict_button,
        video_output,
        inpaint_video_predict_button,
        run_status,
        # video_input  <- removed so the change event does not re-fire
    ]
)
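This is general Gradio behavior rather than anything specific to this repo: when a .change handler's outputs include the component it listens on, writing a new value back re-fires the event. A minimal self-contained sketch of the difference (hypothetical component names):

import gradio as gr

with gr.Blocks() as demo:
    video_input = gr.Video()
    status = gr.Textbox(label="status")

    def on_change(path):
        # Return only derived outputs. If video_input were also listed in
        # outputs and given a new value here, this .change event would fire
        # again on every upload, looping the loading logic.
        return f"loaded: {path}"

    video_input.change(fn=on_change, inputs=[video_input], outputs=[status])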

Extract frames from the uploaded video

def get_frames_from_video(video_input, video_state):
    video_path = video_input
    frames = []
    user_name = time.time()
    vr = VideoReader(video_path)
    original_fps = vr.get_avg_fps()

    # If fps > 8, downsample frames to roughly 8 fps
    if original_fps > 8:
        total_frames = len(vr)
        sample_interval = max(1, int(original_fps / 8))
        frame_indices = list(range(0, total_frames, sample_interval))
        frames = vr.get_batch(frame_indices).asnumpy()
    else:
        frames = vr.get_batch(list(range(len(vr)))).asnumpy()

    # Take only the first 49 frames
    frames = frames[:49]

    # Resize all frames to 480x720 (cv2.resize takes (width, height))
    resized_frames = []
    for frame in frames:
        resized_frame = cv2.resize(frame, (720, 480))
        resized_frames.append(resized_frame)
    frames = np.array(resized_frames)

    init_start = time.time()
    inference_state = predictor.init_state(images=frames, offload_video_to_cpu=True, async_loading_frames=True)
    init_time = time.time() - init_start
    print(f"Inference state initialization took {init_time:.2f}s")

    fps = 8
    image_size = (frames[0].shape[0], frames[0].shape[1])
    # Initialize video_state
    video_state = {
        "user_name": user_name,
        "video_name": os.path.split(video_path)[-1],
        "origin_images": frames,
        "painted_images": frames.copy(),
        "masks": [np.zeros((frames[0].shape[0], frames[0].shape[1]), np.uint8)] * len(frames),
        "logits": [None] * len(frames),
        "select_frame_number": 0,
        "fps": fps,
        "ann_obj_id": 0
    }
    video_info = "Video Name: {}, FPS: {}, Total Frames: {}, Image Size: {}".format(
        video_state["video_name"], video_state["fps"], len(frames), image_size)

    video_input_tem = generate_video_from_frames(frames, output_path=f"{GRADIO_TEMP_DIR}/inpaint/{video_state['video_name']}", fps=video_state["fps"])

    return (
        gr.update(visible=True),
        gr.update(visible=True),
        inference_state,
        video_state,
        video_info,
        video_state["origin_images"][0],
        gr.update(visible=False, maximum=len(frames), value=1, interactive=True),
        gr.update(visible=False, maximum=len(frames), value=len(frames), interactive=True),
        gr.update(visible=True, interactive=True),
        gr.update(visible=True, interactive=True),
        gr.update(visible=True, interactive=True),
        gr.update(visible=True),
        gr.update(visible=True, interactive=True),  # inpaint_video_predict_button (see point 3)
        create_status("Video uploaded. Try clicking the image to add targets to track and inpaint.", StatusMessage.SUCCESS),
        # video_input  <- removed from the return tuple
    )
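One note on the sampling arithmetic above: int(original_fps / 8) rounds down, so the effective rate ends up at or above 8 fps rather than exactly 8. A quick check:

original_fps = 30
sample_interval = max(1, int(original_fps / 8))  # int(3.75) -> 3
effective_fps = original_fps / sample_interval   # 10.0 fps, not 8.0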

2. Because the change in point 1 returns one fewer value, running the example test cases fails with an error about a missing argument. My fix was simply to comment out the last return value there as well (sorry, I forgot exactly where that code is).
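For reference, the shape of that change is just trimming the trailing video_input value so the handler returns exactly as many values as the shortened outputs list expects. A sketch, with load_example as a hypothetical stand-in for whatever helper the example path actually calls:

def process_example(video_input, video_caption, target_region_frame1_caption,
                    point_prompt, click_state):
    results = load_example(video_input, video_caption,
                           target_region_frame1_caption,
                           point_prompt, click_state)  # hypothetical helper
    # Drop the trailing video_input echo so len(results) matches outputs.
    return results[:-1]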

3. I found that the Inpainting button could not be clicked after SAM2 finished processing. My fix was to set interactive=True once the data has finished loading (the code for this is already in point 1).
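In isolation, the change for point 3 is just adding interactive=True to the update that targets inpaint_video_predict_button (illustrative; its actual position in the return tuple is shown in the function above):

import gradio as gr

# Before: the button became visible after loading but stayed disabled.
inpaint_btn_update = gr.update(visible=True)
# After: clickable once frames and inference_state are ready.
inpaint_btn_update = gr.update(visible=True, interactive=True)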

Activity

f-chen165 (Author) commented on Mar 27, 2025

To add to the above: tracking_video_predict_button also runs into an error. My fix is as follows:

Track the video from the selected image and mask

tracking_video_predict_button.click(
    fn=vos_tracking_video,
    inputs=[inference_state, video_state, interactive_state, run_status],
    outputs=[
        inference_state, 
        video_output, 
        video_state, 
        interactive_state, 
        run_status,
        inpaint_video_predict_button,
        enhance_button,
        enhance_target_region_frame1_button,
        enhance_editing_instruction_button,
        # notes_accordion  # Use the accordion reference instead of string
    ]
)

VOS tracking

def vos_tracking_video(inference_state, video_state, interactive_state, previous_status):
    height, width = video_state["origin_images"][0].shape[0:2]

    masks = []
    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
        mask = np.zeros([480, 720, 1])
        for i in range(len(out_mask_logits)):
            out_mask = out_mask_logits[i].cpu().squeeze().detach().numpy()
            out_mask[out_mask > 0] = 1
            out_mask[out_mask <= 0] = 0
            out_mask = out_mask[:, :, None]
            mask += out_mask
        mask = cv2.resize(mask, (width, height))
        mask = mask[:, :, None]
        mask[mask > 0.5] = 1
        mask[mask < 1] = 0
        mask = scipy.ndimage.binary_dilation(mask, iterations=6)
        masks.append(mask)
    masks = np.array(masks)

    painted_images = None
    if interactive_state["track_end_number"]:
        video_state["masks"][video_state["select_frame_number"]:interactive_state["track_end_number"]] = masks
        org_images = video_state["origin_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]]
        color = 255 * np.ones((1, org_images.shape[-3], org_images.shape[-2], 3)) * np.array([[[[0, 1, 1]]]])
        painted_images = np.uint8((1 - 0.5 * masks) * org_images + 0.5 * masks * color)
        video_state["painted_images"][video_state["select_frame_number"]:interactive_state["track_end_number"]] = painted_images
    else:
        video_state["masks"] = masks
        org_images = video_state["origin_images"]
        color = 255 * np.ones((1, org_images.shape[-3], org_images.shape[-2], 3)) * np.array([[[[0, 1, 1]]]])
        painted_images = np.uint8((1 - 0.5 * masks) * org_images + 0.5 * masks * color)
        video_state["painted_images"] = painted_images
    if painted_images is not None:
        video_output = generate_video_from_frames(video_state["painted_images"], output_path=f"{GRADIO_TEMP_DIR}/track/{video_state['video_name']}", fps=video_state["fps"])
    else:
        raise ValueError("No tracking images found")
    interactive_state["inference_times"] += 1

    print(f"func-vos_tracking_video: {video_output}")
    return (
        inference_state,
        video_output,
        video_state,
        interactive_state,
        update_status(previous_status, "Track the selected target region, and then you can use the masks for inpainting.", StatusMessage.SUCCESS),
        gr.Button.update(visible=True, interactive=True),  # inpaint_video_predict_button
        gr.Button.update(visible=True, interactive=True),  # enhance_button
        gr.Button.update(visible=True, interactive=True),  # enhance_target_region_frame1_button
        gr.Button.update(visible=True, interactive=True),  # enhance_editing_instruction_button
        # gr.Accordion(label="My Accordion", open=True)  # open the accordion
    )
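The underlying rule here is general Gradio behavior, not anything VideoPainter-specific: a handler must return exactly as many values as its outputs list names, in the same order, or the event fails with an error like the one above. A minimal reproduction:

import gradio as gr

with gr.Blocks() as demo:
    a = gr.Textbox()
    b = gr.Textbox()
    btn = gr.Button("run")

    def handler():
        # Returning a single value while outputs lists two components
        # errors at click time; returning one value per component fixes it.
        return "first", "second"

    btn.click(fn=handler, inputs=[], outputs=[a, b])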
yxbian23 (Collaborator) commented on Apr 8, 2025


Thank you for your interest in our work! We greatly appreciate your suggestions!

If you have time, could you submit a PR? This would ensure that I can apply your optimizations exactly as intended. If not, I will review and implement the optimizations you mentioned at a later time.

f-chen165 (Author) commented on Apr 9, 2025


I'm really sorry: in my later testing I made fairly large changes to the code, and I can no longer roll back to the version you would want.

yxbian23 (Collaborator) commented on Apr 9, 2025


Oh! Still, big thanks to you! I will review everything you mentioned above and optimize the current app code! 😄
