From b2299236c75df89220d4ddda81ea362d0cae1f2c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 07:57:56 -0800 Subject: [PATCH 01/22] Add initial ocr approach with `call_gpt_4_vision_preview_ocr` --- operate/models/apis.py | 65 +++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 2e1d8bf5..5a0b6d07 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -4,6 +4,7 @@ import base64 import traceback import io +import easyocr from PIL import Image @@ -48,6 +49,9 @@ async def get_next_action(model, messages, objective, session_id): if model == "gpt-4-with-som": operation = await call_gpt_4_vision_preview_labeled(messages, objective) return operation, None + if model == "gpt-4-with-ocr": + operation = await call_gpt_4_vision_preview_ocr(messages, objective) + return operation, None elif model == "agent-1": return "coming soon" elif model == "gemini-pro-vision": @@ -58,7 +62,7 @@ async def get_next_action(model, messages, objective, session_id): def call_gpt_4_vision_preview(messages): if VERBOSE: - print("[Self Operating Computer][get_next_action][call_gpt_4_v]") + print("[Self Operating Computer][call_gpt_4_v]") time.sleep(1) client = config.initialize_openai() try: @@ -80,7 +84,7 @@ def call_gpt_4_vision_preview(messages): if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt", + "[Self Operating Computer][call_gpt_4_v] user_prompt", user_prompt, ) @@ -115,7 +119,7 @@ def call_gpt_4_vision_preview(messages): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_v] content", + "[Self Operating Computer][call_gpt_4_v] content", content, ) content = json.loads(content) @@ -189,6 +193,47 @@ def call_gemini_pro_vision(messages, objective): return call_gpt_4_vision_preview(messages) +async def call_gpt_4_vision_preview_ocr(messages, objective): + if VERBOSE: + print("[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text") + time.sleep(1) + client = config.initialize_openai() + + # Construct the path to the file within the package + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + # Initialize EasyOCR Reader + reader = easyocr.Reader(["ch_sim", "en"]) + + # Read the screenshot + result = reader.readtext(screenshot_filename) + + # Process the result + extracted_text = " ".join([item[1] for item in result]) + if VERBOSE: + print( + "[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text", + extracted_text, + ) + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}", + e, + ) + return call_gpt_4_vision_preview(messages) + + async def call_gpt_4_vision_preview_labeled(messages, objective): time.sleep(1) client = config.initialize_openai() @@ -217,7 +262,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] user_prompt", user_prompt, ) @@ -254,7 +299,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] content", content, ) messages.append(assistant_message) @@ -268,14 +313,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): label = operation.get("label") if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label", label, ) coordinates = get_label_coordinates(label, label_coordinates) if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates", coordinates, ) image = Image.open( @@ -287,7 +332,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): ) if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent", click_position_percent, ) if not click_position_percent: @@ -302,7 +347,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): operation["y"] = y_percent if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation", operation, ) processed_content.append(operation) @@ -311,7 +356,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content", processed_content, ) return processed_content From 4bca53e8568de97a36f07f1770375399811a86fc Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 08:42:03 -0800 Subject: [PATCH 02/22] Add `SYSTEM_PROMPT_OCR_MAC` and `SYSTEM_PROMPT_OCR_WIN_LINUX` --- operate/models/prompts.py | 125 +++++++++++++++++++++++++++++++++----- operate/operate.py | 5 +- 2 files changed, 111 insertions(+), 19 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index c567bf48..57a0d3a5 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -1,4 +1,8 @@ import platform +from operate.config import Config + +# Load configuration +VERBOSE = Config().verbose # General user Prompts USER_QUESTION = "Hello, I can help you with anything. What would you like done?" @@ -196,6 +200,93 @@ """ +SYSTEM_PROMPT_OCR_MAC = """ +You are operating a computer, using the same operating system as a human. + +From looking at the screen, the objective, and your previous actions, take the next best series of action. 
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
+
+1. click - Move mouse and click
+[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them.
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are some helpful combinations:
+
+# Opens Spotlight Search on Mac
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Search for someone on Linkedin when already on linkedin.com
+[
+    {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
+    {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }},
+    {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }},
+]
+
+A very important note: don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective} # take the next best action for this objective
+"""
+
+SYSTEM_PROMPT_OCR_WIN_LINUX = """
+You are operating a computer, using the same operating system as a human.
+
+From looking at the screen, the objective, and your previous actions, take the next best series of actions.
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
+
+1. click - Move mouse and click
+[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them.
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are some helpful combinations:
+
+# Opens Menu Search on Windows and Linux
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Search for someone on Linkedin when already on linkedin.com
+[
+    {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
+    {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }},
+    {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }},
+]
+
+A few important notes:
+
+- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective} # take the next best action for this objective
+"""
+
 OPERATE_FIRST_MESSAGE_PROMPT = """
 Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done
 
@@ -208,26 +299,30 @@
 Action:"""
 
 
-def get_system_prompt(objective):
+def get_system_prompt(model, objective):
     """
     Format the vision prompt
     """
-    if platform.system() == "Darwin":
-        prompt = SYSTEM_PROMPT_MAC.format(objective=objective)
+    if VERBOSE:
+        print("[get_system_prompt] model", model)
+    if model == "gpt-4-with-som":
+        if platform.system() == "Darwin":
+            prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
+        else:
+            prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
+    elif model == "gpt-4-with-ocr":
+        if platform.system() == "Darwin":
+            prompt = SYSTEM_PROMPT_OCR_MAC.format(objective=objective)
+        else:
+            prompt = SYSTEM_PROMPT_OCR_WIN_LINUX.format(objective=objective)
     else:
-        prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective)
-
-    return prompt
+        if platform.system() == "Darwin":
+            prompt = SYSTEM_PROMPT_MAC.format(objective=objective)
+        else:
+            prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective)
 
-
-def get_system_prompt_labeled(objective):
-    """
-    Format the vision prompt
-    """
-    if platform.system() == "Darwin":
-        prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
-    else:
-        prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
+    if VERBOSE:
+        print("[get_system_prompt] prompt", prompt)
 
     return prompt
diff --git a/operate/operate.py b/operate/operate.py
index 29b9ca8e..d1a38e7b 100644
--- a/operate/operate.py
+++ b/operate/operate.py
@@ -96,10 +96,7 @@ def main(model, terminal_prompt, voice_mode=False):
         print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
         objective = prompt(style=style)
 
-    if model == "gpt-4-with-som":
-        system_prompt = get_system_prompt_labeled(objective)
-    else:
-        system_prompt = get_system_prompt(objective)
+    system_prompt = get_system_prompt(model, objective)
     system_message = {"role": "system", "content": system_prompt}
     messages = [system_message]
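For reference, the OCR pass that patch 01 wires into `call_gpt_4_vision_preview_ocr` reduces to a few EasyOCR calls. Here is a minimal, illustrative sketch (not part of any patch), assuming EasyOCR is installed and a screenshot has already been captured:

```python
import easyocr

# EasyOCR downloads its detection/recognition models on first use.
# Patch 01 loads ["ch_sim", "en"]; English-only keeps the download smaller.
reader = easyocr.Reader(["en"])

# readtext returns a list of (bounding_box, text, confidence) tuples
result = reader.readtext("screenshots/screenshot.png")

# Patch 01 joins the recognized strings into one block of context text
extracted_text = " ".join(item[1] for item in result)
print(extracted_text)
```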
From 8f7765c5beb4fd177713f7d60415f13f3dd92d51 Mon Sep 17 00:00:00
2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:03:05 -0800 Subject: [PATCH 03/22] Iterate ocr method with `get_text_element` --- operate/models/apis.py | 105 ++++++++++++++++++++++++++++++++--------- operate/utils/ocr.py | 33 +++++++++++++ 2 files changed, 117 insertions(+), 21 deletions(-) create mode 100644 operate/utils/ocr.py diff --git a/operate/models/apis.py b/operate/models/apis.py index 5a0b6d07..5650b6b3 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -20,6 +20,7 @@ get_user_prompt, get_system_prompt, ) +from operate.utils.ocr import get_text_element from operate.utils.label import ( @@ -62,7 +63,7 @@ async def get_next_action(model, messages, objective, session_id): def call_gpt_4_vision_preview(messages): if VERBOSE: - print("[Self Operating Computer][call_gpt_4_v]") + print("[call_gpt_4_v]") time.sleep(1) client = config.initialize_openai() try: @@ -84,7 +85,7 @@ def call_gpt_4_vision_preview(messages): if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_v] user_prompt", + "[call_gpt_4_v] user_prompt", user_prompt, ) @@ -119,7 +120,7 @@ def call_gpt_4_vision_preview(messages): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_v] content", + "[call_gpt_4_v] content", content, ) content = json.loads(content) @@ -161,25 +162,23 @@ def call_gemini_pro_vision(messages, objective): capture_screen_with_cursor(screenshot_filename) # sleep for a second time.sleep(1) - prompt = get_system_prompt(objective) + prompt = get_system_prompt("gemini-pro-vision", objective) model = config.initialize_google() if VERBOSE: - print("[Self Operating Computer][call_gemini_pro_vision] model", model) + print("[call_gemini_pro_vision] model", model) response = model.generate_content([prompt, Image.open(screenshot_filename)]) content = response.text[1:] if VERBOSE: - print( - "[Self Operating Computer][call_gemini_pro_vision] response", response - ) - print("[Self Operating Computer][call_gemini_pro_vision] content", content) + print("[call_gemini_pro_vision] response", response) + print("[call_gemini_pro_vision] content", content) content = json.loads(content) if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content", + "[get_next_action][call_gemini_pro_vision] content", content, ) @@ -195,7 +194,7 @@ def call_gemini_pro_vision(messages, objective): async def call_gpt_4_vision_preview_ocr(messages, objective): if VERBOSE: - print("[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text") + print("[call_gpt_4_vision_preview_ocr] extracted_text") time.sleep(1) client = config.initialize_openai() @@ -212,20 +211,84 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - # Initialize EasyOCR Reader - reader = easyocr.Reader(["ch_sim", "en"]) + img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) - # Read the screenshot - result = reader.readtext(screenshot_filename) + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() - # Process the result - extracted_text = " ".join([item[1] for item in result]) if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text", - extracted_text, + "[call_gpt_4_vision_preview_labeled] user_prompt", + user_prompt, ) + vision_message = { + "role": "user", + "content": [ + {"type": 
"text", "text": user_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_base64_labeled}" + }, + }, + ], + } + messages.append(vision_message) + + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=messages, + presence_penalty=1, + frequency_penalty=1, + temperature=0.7, + max_tokens=1000, + ) + + content = response.choices[0].message.content + + if content.startswith("```json"): + content = content[len("```json") :] # Remove starting ```json + if content.endswith("```"): + content = content[: -len("```")] # Remove ending + + content = json.loads(content) + if VERBOSE: + print("[call_gpt_4_vision_preview_ocr] content", content) + + processed_content = [] + + for operation in content: + if operation.get("operation") == "click": + text_to_click = operation.get("text") + if VERBOSE: + print( + "[call_gpt_4_vision_preview_ocr][click] text_to_click", + text_to_click, + ) + text_element_index = get_text_element(result, search_text) + if VERBOSE: + print( + "[call_gpt_4_vision_preview_ocr][click] text_element_index", + text_element_index, + ) + + else: + processed_content.append(operation) + + # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history + assistant_message = {"role": "assistant", "content": content} + if VERBOSE: + print( + "[call_gpt_4_vision_preview_labeled] content", + content, + ) + messages.append(assistant_message) + + return processed_content + except Exception as e: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}", @@ -262,7 +325,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_vision_preview_labeled] user_prompt", + "[call_gpt_4_vision_preview_labeled] user_prompt", user_prompt, ) @@ -299,7 +362,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_vision_preview_labeled] content", + "[call_gpt_4_vision_preview_labeled] content", content, ) messages.append(assistant_message) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py new file mode 100644 index 00000000..c47685d1 --- /dev/null +++ b/operate/utils/ocr.py @@ -0,0 +1,33 @@ +from operate.config import Config +from operate.config import Config + +# Load configuration +VERBOSE = Config().verbose + + +def get_text_element(result, search_text): + """ + Searches for a text element in the OCR results and returns its index. + Args: + result (list): The list of results returned by EasyOCR. + search_text (str): The text to search for in the OCR results. + + Returns: + int: The index of the element containing the search text. + + Raises: + Exception: If the text element is not found in the results. 
+ """ + if VERBOSE: + print("[get_text_element]") + print("[get_text_element] search_text", search_text) + for index, element in enumerate(result): + text = element[1] + if VERBOSE: + print("[get_text_element][loop] text", text) + if search_text in text: + if VERBOSE: + print("[get_text_element][loop] found search_text, index:", index) + + return index + raise Exception("The text element was not found in the image") From a156a48a5d33ed680aa98d74c9007b2e7b63e775 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:13:06 -0800 Subject: [PATCH 04/22] Bug fixes --- operate/models/apis.py | 16 +++++++++------- operate/operate.py | 1 - operate/utils/ocr.py | 1 - 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 5650b6b3..8decea15 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -211,8 +211,6 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) - if len(messages) == 1: user_prompt = get_user_first_message_prompt() else: @@ -220,7 +218,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): if VERBOSE: print( - "[call_gpt_4_vision_preview_labeled] user_prompt", + "[call_gpt_4_vision_preview_ocr] user_prompt", user_prompt, ) @@ -230,9 +228,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): {"type": "text", "text": user_prompt}, { "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{img_base64_labeled}" - }, + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, }, ], } @@ -268,7 +264,13 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): "[call_gpt_4_vision_preview_ocr][click] text_to_click", text_to_click, ) - text_element_index = get_text_element(result, search_text) + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + result = reader.readtext(screenshot_filename) + + text_element_index = get_text_element(result, text_to_click) if VERBOSE: print( "[call_gpt_4_vision_preview_ocr][click] text_element_index", diff --git a/operate/operate.py b/operate/operate.py index d1a38e7b..ab79d4f7 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -11,7 +11,6 @@ from operate.models.prompts import ( USER_QUESTION, get_system_prompt, - get_system_prompt_labeled, ) from operate.config import Config from operate.utils.style import ( diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index c47685d1..f918607a 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,5 +1,4 @@ from operate.config import Config -from operate.config import Config # Load configuration VERBOSE = Config().verbose From f6b61c63416eadf98a031c008965419cdf46f777 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:13:22 -0800 Subject: [PATCH 05/22] Experimental prompt fixes --- operate/models/prompts.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 57a0d3a5..e200bb37 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -50,7 +50,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 
-Objective: {objective} # take the next best action for this objective +Objective: {objective} """ SYSTEM_PROMPT_WIN_LINUX = """ @@ -95,7 +95,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ @@ -146,7 +146,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ SYSTEM_PROMPT_LABELED_WIN_LINUX = """ @@ -196,7 +196,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ @@ -239,7 +239,7 @@ A very important note, don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ SYSTEM_PROMPT_OCR_WIN_LINUX = """ @@ -284,7 +284,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ OPERATE_FIRST_MESSAGE_PROMPT = """ From c90ce6702993becb19a36bbb0883a0112b56f953 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:20:41 -0800 Subject: [PATCH 06/22] Add `get_text_coordinates` --- operate/models/apis.py | 8 +++++++- operate/utils/ocr.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 8decea15..9bd03252 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -20,7 +20,7 @@ get_user_prompt, get_system_prompt, ) -from operate.utils.ocr import get_text_element +from operate.utils.ocr import get_text_element, get_text_coordinates from operate.utils.label import ( @@ -271,11 +271,17 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): result = reader.readtext(screenshot_filename) text_element_index = get_text_element(result, text_to_click) + coordinates = get_text_coordinates(result, text_element_index) + if VERBOSE: print( "[call_gpt_4_vision_preview_ocr][click] text_element_index", text_element_index, ) + print( + "[call_gpt_4_vision_preview_ocr][click] coordinates", + coordinates, + ) else: processed_content.append(operation) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index f918607a..ce6cecd5 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -30,3 +30,31 @@ def get_text_element(result, search_text): return index raise Exception("The text element was not found in the image") + + +def get_text_coordinates(result, index): + """ + Gets the center coordinates of the text element at the specified index. 
+ Args: + result (list): The list of results returned by EasyOCR. + index (int): The index of the text element in the results list. + + Returns: + dict: A dictionary containing the 'x' and 'y' coordinates of the center of the text element. + """ + if index >= len(result): + raise Exception("Index out of range in OCR results") + + # Get the bounding box of the text element + bounding_box = result[index][0] + + # Calculate the center of the bounding box + min_x = min([coord[0] for coord in bounding_box]) + max_x = max([coord[0] for coord in bounding_box]) + min_y = min([coord[1] for coord in bounding_box]) + max_y = max([coord[1] for coord in bounding_box]) + + center_x = (min_x + max_x) / 2 + center_y = (min_y + max_y) / 2 + + return {"x": center_x, "y": center_y} From ff7a716dfa44569349a187fcac6841f34297c3fd Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:16:26 -0800 Subject: [PATCH 07/22] Change dir to `labeled_images_dir` --- operate/utils/label.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/operate/utils/label.py b/operate/utils/label.py index 14232391..20d55fee 100644 --- a/operate/utils/label.py +++ b/operate/utils/label.py @@ -68,11 +68,11 @@ def add_labels(base64_data, yolo_model): ) # Create a separate draw object for the debug image font_size = 45 - detections_dir = "detections" + labeled_images_dir = "labeled_images" label_coordinates = {} # Dictionary to store coordinates - if not os.path.exists(detections_dir): - os.makedirs(detections_dir) + if not os.path.exists(labeled_images_dir): + os.makedirs(labeled_images_dir) counter = 0 drawn_boxes = [] # List to keep track of boxes already drawn @@ -116,9 +116,11 @@ def add_labels(base64_data, yolo_model): # Save the image timestamp = time.strftime("%Y%m%d-%H%M%S") - output_path = os.path.join(detections_dir, f"img_{timestamp}_labeled.png") - output_path_debug = os.path.join(detections_dir, f"img_{timestamp}_debug.png") - output_path_original = os.path.join(detections_dir, f"img_{timestamp}_original.png") + output_path = os.path.join(labeled_images_dir, f"img_{timestamp}_labeled.png") + output_path_debug = os.path.join(labeled_images_dir, f"img_{timestamp}_debug.png") + output_path_original = os.path.join( + labeled_images_dir, f"img_{timestamp}_original.png" + ) image_labeled.save(output_path) image_debug.save(output_path_debug) From 5f5ed69f33d451a5928f2c419599ffcaf3564625 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:16:52 -0800 Subject: [PATCH 08/22] Iterate `get_text_coordinates` --- operate/models/apis.py | 13 ++++++++++++- operate/models/prompts.py | 4 ++-- operate/utils/ocr.py | 18 ++++++++++++++---- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 9bd03252..f8e440e5 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -271,7 +271,13 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): result = reader.readtext(screenshot_filename) text_element_index = get_text_element(result, text_to_click) - coordinates = get_text_coordinates(result, text_element_index) + coordinates = get_text_coordinates( + result, text_element_index, screenshot_filename + ) + + # add `coordinates`` to `content` + operation["x"] = coordinates["x"] + operation["y"] = coordinates["y"] if VERBOSE: print( @@ -282,6 +288,11 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): "[call_gpt_4_vision_preview_ocr][click] coordinates", coordinates, ) + print( + 
"[call_gpt_4_vision_preview_ocr][click] final operation", + operation, + ) + processed_content.append(operation) else: processed_content.append(operation) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index e200bb37..c8338eab 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -321,8 +321,8 @@ def get_system_prompt(model, objective): else: prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective) - if VERBOSE: - print("[get_system_prompt] prompt", prompt) + # if VERBOSE: + # print("[get_system_prompt] prompt", prompt) return prompt diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index ce6cecd5..41354eee 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,4 +1,5 @@ from operate.config import Config +from PIL import Image # Load configuration VERBOSE = Config().verbose @@ -32,15 +33,16 @@ def get_text_element(result, search_text): raise Exception("The text element was not found in the image") -def get_text_coordinates(result, index): +def get_text_coordinates(result, index, image_path): """ - Gets the center coordinates of the text element at the specified index. + Gets the coordinates of the text element at the specified index as a percentage of screen width and height. Args: result (list): The list of results returned by EasyOCR. index (int): The index of the text element in the results list. + image_path (str): Path to the screenshot image. Returns: - dict: A dictionary containing the 'x' and 'y' coordinates of the center of the text element. + dict: A dictionary containing the 'x' and 'y' coordinates as percentages of the screen width and height. """ if index >= len(result): raise Exception("Index out of range in OCR results") @@ -57,4 +59,12 @@ def get_text_coordinates(result, index): center_x = (min_x + max_x) / 2 center_y = (min_y + max_y) / 2 - return {"x": center_x, "y": center_y} + # Get image dimensions + with Image.open(image_path) as img: + width, height = img.size + + # Convert to percentages + percent_x = round((center_x / width), 1) + percent_y = round((center_y / height), 1) + + return {"x": percent_x, "y": percent_y} From 4790a4fa46d46a66e2b4bc4135d20e6f955f10cf Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:30:09 -0800 Subject: [PATCH 09/22] Fix `get_text_coordinates` --- operate/models/apis.py | 4 +++- operate/utils/ocr.py | 41 +++++++++++++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index f8e440e5..08e9e96a 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -270,7 +270,9 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): # Read the screenshot result = reader.readtext(screenshot_filename) - text_element_index = get_text_element(result, text_to_click) + text_element_index = get_text_element( + result, text_to_click, screenshot_filename + ) coordinates = get_text_coordinates( result, text_element_index, screenshot_filename ) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index 41354eee..90b66eda 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,16 +1,18 @@ from operate.config import Config -from PIL import Image +from PIL import Image, ImageDraw +import os # Load configuration VERBOSE = Config().verbose -def get_text_element(result, search_text): +def get_text_element(result, search_text, image_path): """ - Searches for a text element in the OCR results and returns its index. 
+ Searches for a text element in the OCR results and returns its index. Also draws bounding boxes on the image. Args: result (list): The list of results returned by EasyOCR. search_text (str): The text to search for in the OCR results. + image_path (str): Path to the original image. Returns: int: The index of the element containing the search text. @@ -21,15 +23,42 @@ def get_text_element(result, search_text): if VERBOSE: print("[get_text_element]") print("[get_text_element] search_text", search_text) + # Create /ocr directory if it doesn't exist + ocr_dir = "ocr" + if not os.path.exists(ocr_dir): + os.makedirs(ocr_dir) + + # Open the original image + image = Image.open(image_path) + draw = ImageDraw.Draw(image) + + found_index = None for index, element in enumerate(result): text = element[1] + box = element[0] + if VERBOSE: + # Draw bounding box in blue + draw.polygon([tuple(point) for point in box], outline="blue") print("[get_text_element][loop] text", text) + if search_text in text: + found_index = index if VERBOSE: print("[get_text_element][loop] found search_text, index:", index) - return index + if found_index is not None: + if VERBOSE: + # Draw bounding box of the found text in red + box = result[found_index][0] + draw.polygon([tuple(point) for point in box], outline="red") + # Save the image with bounding boxes + ocr_image_path = os.path.join(ocr_dir, "ocr_image.png") + image.save(ocr_image_path) + print("[get_text_element] OCR image saved at:", ocr_image_path) + + return found_index + raise Exception("The text element was not found in the image") @@ -64,7 +93,7 @@ def get_text_coordinates(result, index, image_path): width, height = img.size # Convert to percentages - percent_x = round((center_x / width), 1) - percent_y = round((center_y / height), 1) + percent_x = round((center_x / width), 3) + percent_y = round((center_y / height), 3) return {"x": percent_x, "y": percent_y} From 3281fac9fba1aa3a0cf3d3720894e01d1c2fc431 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:58:47 -0800 Subject: [PATCH 10/22] iterate get text function --- operate/config.py | 2 +- operate/utils/ocr.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/operate/config.py b/operate/config.py index d68382da..53df0361 100644 --- a/operate/config.py +++ b/operate/config.py @@ -26,7 +26,7 @@ def __new__(cls): def __init__(self): load_dotenv() - self.verbose = False + self.verbose = True self.openai_api_key = ( None # instance variables are backups in case saving to a `.env` fails ) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index 90b66eda..c6ecc599 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,6 +1,7 @@ from operate.config import Config from PIL import Image, ImageDraw import os +from datetime import datetime # Load configuration VERBOSE = Config().verbose @@ -53,7 +54,8 @@ def get_text_element(result, search_text, image_path): box = result[found_index][0] draw.polygon([tuple(point) for point in box], outline="red") # Save the image with bounding boxes - ocr_image_path = os.path.join(ocr_dir, "ocr_image.png") + datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") + ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png") image.save(ocr_image_path) print("[get_text_element] OCR image saved at:", ocr_image_path) From df1cbd245525a53a2aab3fa72ea1a45ffc727d81 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 13:12:49 -0800 Subject: [PATCH 11/22] add `content_str` for proper message structure --- 
 operate/models/apis.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 08e9e96a..2a25196d 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -250,6 +250,8 @@ async def call_gpt_4_vision_preview_ocr(messages, objective):
         if content.endswith("```"):
             content = content[: -len("```")]  # Remove ending
 
+        content_str = content
+
         content = json.loads(content)
         if VERBOSE:
             print("[call_gpt_4_vision_preview_ocr] content", content)
@@ -300,12 +302,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective):
                 processed_content.append(operation)
 
         # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history
-        assistant_message = {"role": "assistant", "content": content}
-        if VERBOSE:
-            print(
-                "[call_gpt_4_vision_preview_labeled] content",
-                content,
-            )
+        assistant_message = {"role": "assistant", "content": content_str}
         messages.append(assistant_message)
 
         return processed_content

From 14f3a8a255ff8e3e1b742a8ff7bd2605b1e8ed6f Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 21 Jan 2024 13:19:51 -0800
Subject: [PATCH 12/22] Adjust `SYSTEM_PROMPT_OCR_MAC`

---
 operate/config.py         |  3 ---
 operate/models/prompts.py | 12 ++++++++++--
 operate/utils/ocr.py      |  1 -
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index 53df0361..6d69158d 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -49,9 +49,6 @@ def initialize_openai(self):
             )
             api_key = os.getenv("OPENAI_API_KEY")
 
-        if self.verbose:
-            print("[Config][initialize_openai] api_key", api_key)
-
         client = OpenAI(
             api_key=api_key,
         )
diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index c8338eab..af388809 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -221,15 +221,23 @@
 
 Return the actions in array format `[]`. You can take just one action or multiple actions.
 
-Here are some helpful combinations:
+Here is a helpful example:
 
-# Opens Spotlight Search on Mac
+# Opens Spotlight Search on Mac and see if Google Chrome is available to use
 [
     {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
     {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
     {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
 ]
 
+# Go to a website (LinkedIn) when the browser is already open
+
+[
+    {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["command", "l"] }},
+    {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }},
+    {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }}
+]
+
 # Search for someone on Linkedin when already on linkedin.com
 [
     {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py
index c6ecc599..68b4d435 100644
--- a/operate/utils/ocr.py
+++ b/operate/utils/ocr.py
@@ -41,7 +41,6 @@ def get_text_element(result, search_text, image_path):
         if VERBOSE:
             # Draw bounding box in blue
             draw.polygon([tuple(point) for point in box], outline="blue")
-            print("[get_text_element][loop] text", text)
 
         if search_text in text:
             found_index = index

From 2716b9217242b92b48c4afe1863b11d1cb5791a5 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 21 Jan 2024 13:24:38 -0800
Subject: [PATCH 13/22] Add `easyocr` to `requirements.txt`

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 2c796cd9..f2727e69 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -49,4 +49,5 @@ wcwidth==0.2.9
 zipp==3.17.0
 google-generativeai==0.3.0
 aiohttp==3.9.1
-ultralytics==8.0.227
\ No newline at end of file
+ultralytics==8.0.227
+easyocr==1.7.1
\ No newline at end of file
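Before moving on to the prompt updates, it helps to see how the pieces built up across patches 03 through 09 fit together. The following is a condensed, illustrative sketch of what `get_text_element` and `get_text_coordinates` jointly do for a `click` operation; the function name here is ours, and error handling is simplified (the real `get_text_element` raises an `Exception` when no match is found):

```python
from PIL import Image


def text_to_percent_coordinates(result, search_text, image_path):
    # Find the first OCR element whose recognized text contains search_text;
    # `result` is EasyOCR's list of (bounding_box, text, confidence) tuples
    index = next(i for i, element in enumerate(result) if search_text in element[1])

    # Take the bounding box extremes to locate the element's center in pixels
    box = result[index][0]
    xs = [point[0] for point in box]
    ys = [point[1] for point in box]
    center_x = (min(xs) + max(xs)) / 2
    center_y = (min(ys) + max(ys)) / 2

    # Express the center as a fraction of the screenshot size, which is the
    # format the click operation carries in its "x" and "y" fields
    with Image.open(image_path) as img:
        width, height = img.size
    return {"x": round(center_x / width, 3), "y": round(center_y / height, 3)}
```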
From 817d137eedc50838df3ee6ca55916b3636f310d4 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 21 Jan 2024 13:24:50 -0800
Subject: [PATCH 14/22] Update `SYSTEM_PROMPT_OCR_WIN_LINUX`

---
 operate/models/prompts.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index af388809..14b89ee9 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -273,13 +273,21 @@
 
 Here are some helpful combinations:
 
-# Opens Menu Search on Windows and Linux
+# Opens Menu Search on Windows and Linux and see if Google Chrome is available to use
 [
     {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
     {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
     {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
 ]
 
+# Go to a website (LinkedIn) when the browser is already open
+
+[
+    {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["ctrl", "l"] }},
+    {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }},
+    {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }}
+]
+
 # Search for someone on Linkedin when already on linkedin.com
 [
     {{ "thought": "I can see the search field with the placeholder text 'search'. 
I click that field to search", "operation": "click", "text": "search" }}, From ff35f17a9b80624d903527a251852f0a69a1242c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 22 Jan 2024 07:33:33 -0800 Subject: [PATCH 15/22] Create `gpt_4_fallback` method --- operate/models/apis.py | 52 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 2a25196d..64499774 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -51,7 +51,7 @@ async def get_next_action(model, messages, objective, session_id): operation = await call_gpt_4_vision_preview_labeled(messages, objective) return operation, None if model == "gpt-4-with-ocr": - operation = await call_gpt_4_vision_preview_ocr(messages, objective) + operation = await call_gpt_4_vision_preview_ocr(messages, objective, model) return operation, None elif model == "agent-1": return "coming soon" @@ -192,14 +192,16 @@ def call_gemini_pro_vision(messages, objective): return call_gpt_4_vision_preview(messages) -async def call_gpt_4_vision_preview_ocr(messages, objective): +async def call_gpt_4_vision_preview_ocr(messages, objective, model): if VERBOSE: - print("[call_gpt_4_vision_preview_ocr] extracted_text") - time.sleep(1) - client = config.initialize_openai() + print("[call_gpt_4_vision_preview_ocr]") # Construct the path to the file within the package try: + time.sleep(1) + client = config.initialize_openai() + + confirm_system_prompt(messages, objective, model) screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): os.makedirs(screenshots_dir) @@ -312,7 +314,8 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}", e, ) - return call_gpt_4_vision_preview(messages) + traceback.print_exc() + return gpt_4_fallback(messages, objective, model) async def call_gpt_4_vision_preview_labeled(messages, objective): @@ -447,6 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}", e, ) + traceback.print_exc() return call_gpt_4_vision_preview(messages) @@ -462,3 +466,39 @@ def get_last_assistant_message(messages): else: return messages[index] return None # Return None if no assistant message is found + + +def gpt_4_fallback(messages, objective, model): + if VERBOSE: + print("[gpt_4_fallback]") + system_prompt = get_system_prompt("gpt-4-vision-preview", objective) + new_system_message = {"role": "system", "content": system_prompt} + # remove and replace the first message in `messages` with `new_system_message` + + messages[0] = new_system_message + if VERBOSE: + print("[gpt_4_fallback] new messages", messages) + + if VERBOSE: + print("[gpt_4_fallback][updated]") + print("[gpt_4_fallback][updated] len(messages)", len(messages)) + + return call_gpt_4_vision_preview(messages) + + +def confirm_system_prompt(messages, objective, model): + """ + On `Exception` we default to `call_gpt_4_vision_preview` so we have this function to reassign system prompt in case of a previous failure + """ + if VERBOSE: + print("[confirm_system_prompt]") + + system_prompt = get_system_prompt(model, objective) + new_system_message = {"role": "system", "content": system_prompt} + # remove and replace the first message in `messages` with `new_system_message` + + messages[0] = new_system_message + + if VERBOSE: + print("[confirm_system_prompt][updated]") + print("[confirm_system_prompt][updated] len(messages)", len(messages)) From 135b3ea6e4a79d85946bdf23c178c3a96cea6075 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 22 Jan 2024 07:33:53 -0800 Subject: [PATCH 16/22] New click prompt 'no button' approach for `SYSTEM_PROMPT_OCR_MAC` --- operate/models/prompts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 14b89ee9..2bac4cb0 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -208,7 +208,7 @@ You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click -[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them. +[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons or links with text to click. If the button you want to click doesn't have text you can say `"no button"` for the text value and we'll try a different method. 2. write - Write with your keyboard [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] @@ -233,7 +233,7 @@ # Go to a website (LinkedIn) when the browser is already open [ - {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["command", "l"] }}, + {{ "thought": "I can see that Google Chrome is open. 
I'll focus on the address bar to type ", "operation": "press", "keys": ["command", "t"] }}, {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }}, {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }} ] @@ -258,7 +258,7 @@ You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click -[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them. +[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons or links with text to click. If the button you want to click doesn't have text you can say `"no button"` for the text value and we'll try a different method. 2. write - Write with your keyboard [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] @@ -283,7 +283,7 @@ # Go to a website (LinkedIn) when the browser is already open [ - {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["ctrl", "l"] }}, + {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["ctrl", "t"] }}, {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }}, {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }} ] From 849cc7efba40b16dc4d9ab6a32b225d4a57f8a20 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Tue, 23 Jan 2024 19:12:39 -0800 Subject: [PATCH 17/22] update `get_system_prompt` --- operate/models/prompts.py | 55 +++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 2bac4cb0..5ba4ec67 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -317,28 +317,43 @@ def get_system_prompt(model, objective): """ - Format the vision prompt + Format the vision prompt more efficiently and print the name of the prompt used """ + + prompt_map = { + ("gpt-4-with-som", "Darwin"): ( + SYSTEM_PROMPT_LABELED_MAC, + "SYSTEM_PROMPT_LABELED_MAC", + ), + ("gpt-4-with-som", "Other"): ( + SYSTEM_PROMPT_LABELED_WIN_LINUX, + "SYSTEM_PROMPT_LABELED_WIN_LINUX", + ), + ("gpt-4-with-ocr", "Darwin"): (SYSTEM_PROMPT_OCR_MAC, "SYSTEM_PROMPT_OCR_MAC"), + ("gpt-4-with-ocr", "Other"): ( + SYSTEM_PROMPT_OCR_WIN_LINUX, + "SYSTEM_PROMPT_OCR_WIN_LINUX", + ), + ("default", "Darwin"): (SYSTEM_PROMPT_MAC, "SYSTEM_PROMPT_MAC"), + ("default", "Other"): (SYSTEM_PROMPT_WIN_LINUX, "SYSTEM_PROMPT_WIN_LINUX"), + } + + os_type = "Darwin" if platform.system() == "Darwin" else "Other" + + # Fetching the prompt tuple (string and name) based on the model and OS + prompt_tuple = prompt_map.get((model, os_type), prompt_map[("default", os_type)]) + + # Extracting the prompt string and its name + prompt_string, prompt_name = prompt_tuple + + # Formatting the prompt + prompt = prompt_string.format(objective=objective) + + # Optional verbose output if VERBOSE: - print("[get_system_prompt] 
model", model) - if model == "gpt-4-with-som": - if platform.system() == "Darwin": - prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective) - else: - prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective) - elif model == "gpt-4-with-ocr": - if platform.system() == "Darwin": - prompt = SYSTEM_PROMPT_OCR_MAC.format(objective=objective) - else: - prompt = SYSTEM_PROMPT_OCR_WIN_LINUX.format(objective=objective) - else: - if platform.system() == "Darwin": - prompt = SYSTEM_PROMPT_MAC.format(objective=objective) - else: - prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective) - - # if VERBOSE: - # print("[get_system_prompt] prompt", prompt) + print("[get_system_prompt] model:", model) + print("[get_system_prompt] prompt name:", prompt_name) + # print("[get_system_prompt] prompt:", prompt) return prompt From e1fa95374d2b4322be06722f0131ac1a551b0d23 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:24:30 -0800 Subject: [PATCH 18/22] Turn off `verbose` --- operate/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index 6d69158d..0254862f 100644 --- a/operate/config.py +++ b/operate/config.py @@ -26,7 +26,7 @@ def __new__(cls): def __init__(self): load_dotenv() - self.verbose = True + self.verbose = False self.openai_api_key = ( None # instance variables are backups in case saving to a `.env` fails ) From 39ddc45dd6879b93494d47068b7bf149162b114c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:27:04 -0800 Subject: [PATCH 19/22] Add missing `if self.verbose:` --- operate/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/operate/config.py b/operate/config.py index 0254862f..581af043 100644 --- a/operate/config.py +++ b/operate/config.py @@ -62,9 +62,10 @@ def initialize_google(self): print("[Config][initialize_google] using cached google_api_key") api_key = self.google_api_key else: - print( - "[Config][initialize_google] no cached google_api_key, try to get from env." - ) + if self.verbose: + print( + "[Config][initialize_google] no cached google_api_key, try to get from env." + ) api_key = os.getenv("GOOGLE_API_KEY") genai.configure(api_key=api_key, transport="rest") model = genai.GenerativeModel("gemini-pro-vision") From e8ffbb5edbfecb722df8b7d6734229cfebbbdce6 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:29:12 -0800 Subject: [PATCH 20/22] Default to `gpt-4-with-ocr` since it performs best --- operate/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/main.py b/operate/main.py index 3cf991da..73065854 100644 --- a/operate/main.py +++ b/operate/main.py @@ -15,7 +15,7 @@ def main_entry(): "--model", help="Specify the model to use", required=False, - default="gpt-4", + default="gpt-4-with-ocr", ) # Add a voice flag From 24e688432bdd2e8f8a9f90e7e2792499ce6ae0b7 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:40:11 -0800 Subject: [PATCH 21/22] Add `### Optical Character Recognition Mode `-m gpt-4-with-ocr`` --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 120b83f9..af201d62 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,11 @@ operate -m gemini-pro-vision **Enter your Google AI Studio API key when terminal prompts you for it** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. 
You may also need to [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working; if anyone knows a simpler way, please make a PR:

+### Optical Character Recognition Mode `-m gpt-4-with-ocr`
+The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable text elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then references the hash map to get the coordinates of that element.
+
+Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use the OCR mode you can simply write `operate` or `operate -m gpt-4-with-ocr` will also work.
+
 ### Set-of-Mark Prompting `-m gpt-4-with-som`
 The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.

From b1cc4faab51cec108a7ddfbb748bf138441f22cc Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Wed, 24 Jan 2024 08:41:53 -0800
Subject: [PATCH 22/22] Update `readme.md`

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index af201d62..f52399bf 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,9 @@ operate -m gemini-pro-vision
 ### Optical Character Recognition Mode `-m gpt-4-with-ocr`
 The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable text elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then references the hash map to get the coordinates of that element.
 
-Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use the OCR mode you can simply write `operate` or `operate -m gpt-4-with-ocr` will also work.
+Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use the OCR mode you can simply run:
+
+`operate` (or `operate -m gpt-4-with-ocr`, which also works).
 
 ### Set-of-Mark Prompting `-m gpt-4-with-som`
 The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.
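Putting the series together, a `click` operation in the new OCR mode travels from model output to a real mouse click roughly as follows. This is a simplified sketch rather than the framework's exact executor: it assumes the percentage coordinates produced by `get_text_coordinates` are converted to pixels by multiplying against the screen size, and the operation dict and coordinate values shown are hypothetical:

```python
import pyautogui

# A click operation as the model might return it in gpt-4-with-ocr mode
operation = {"thought": "I need to search", "operation": "click", "text": "search"}

# Percent coordinates as attached by the OCR lookup (hypothetical values)
operation["x"], operation["y"] = 0.512, 0.043

# Convert the percentages to pixel coordinates and perform the click
screen_width, screen_height = pyautogui.size()
pyautogui.click(operation["x"] * screen_width, operation["y"] * screen_height)
```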