From b2299236c75df89220d4ddda81ea362d0cae1f2c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 07:57:56 -0800 Subject: [PATCH 01/22] Add initial ocr approach with `call_gpt_4_vision_preview_ocr` --- operate/models/apis.py | 65 +++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 2e1d8bf5..5a0b6d07 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -4,6 +4,7 @@ import base64 import traceback import io +import easyocr from PIL import Image @@ -48,6 +49,9 @@ async def get_next_action(model, messages, objective, session_id): if model == "gpt-4-with-som": operation = await call_gpt_4_vision_preview_labeled(messages, objective) return operation, None + if model == "gpt-4-with-ocr": + operation = await call_gpt_4_vision_preview_ocr(messages, objective) + return operation, None elif model == "agent-1": return "coming soon" elif model == "gemini-pro-vision": @@ -58,7 +62,7 @@ async def get_next_action(model, messages, objective, session_id): def call_gpt_4_vision_preview(messages): if VERBOSE: - print("[Self Operating Computer][get_next_action][call_gpt_4_v]") + print("[Self Operating Computer][call_gpt_4_v]") time.sleep(1) client = config.initialize_openai() try: @@ -80,7 +84,7 @@ def call_gpt_4_vision_preview(messages): if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt", + "[Self Operating Computer][call_gpt_4_v] user_prompt", user_prompt, ) @@ -115,7 +119,7 @@ def call_gpt_4_vision_preview(messages): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_v] content", + "[Self Operating Computer][call_gpt_4_v] content", content, ) content = json.loads(content) @@ -189,6 +193,47 @@ def call_gemini_pro_vision(messages, objective): return call_gpt_4_vision_preview(messages) +async def call_gpt_4_vision_preview_ocr(messages, objective): + if VERBOSE: + print("[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text") + time.sleep(1) + client = config.initialize_openai() + + # Construct the path to the file within the package + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + # Initialize EasyOCR Reader + reader = easyocr.Reader(["ch_sim", "en"]) + + # Read the screenshot + result = reader.readtext(screenshot_filename) + + # Process the result + extracted_text = " ".join([item[1] for item in result]) + if VERBOSE: + print( + "[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text", + extracted_text, + ) + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}", + e, + ) + return call_gpt_4_vision_preview(messages) + + async def call_gpt_4_vision_preview_labeled(messages, objective): time.sleep(1) client = config.initialize_openai() @@ -217,7 +262,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] user_prompt", user_prompt, ) @@ -254,7 +299,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] content", content, ) messages.append(assistant_message) @@ -268,14 +313,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): label = operation.get("label") if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label", label, ) coordinates = get_label_coordinates(label, label_coordinates) if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates", coordinates, ) image = Image.open( @@ -287,7 +332,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): ) if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent", click_position_percent, ) if not click_position_percent: @@ -302,7 +347,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): operation["y"] = y_percent if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation", operation, ) processed_content.append(operation) @@ -311,7 +356,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content", + "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content", processed_content, ) return processed_content From 4bca53e8568de97a36f07f1770375399811a86fc Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 08:42:03 -0800 Subject: [PATCH 02/22] Add `SYSTEM_PROMPT_OCR_MAC` and `SYSTEM_PROMPT_OCR_WIN_LINUX` --- operate/models/prompts.py | 125 +++++++++++++++++++++++++++++++++----- operate/operate.py | 5 +- 2 files changed, 111 insertions(+), 19 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index c567bf48..57a0d3a5 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -1,4 +1,8 @@ import platform +from operate.config import Config + +# Load configuration +VERBOSE = Config().verbose # General user Prompts USER_QUESTION = "Hello, I can help you with anything. What would you like done?" @@ -196,6 +200,93 @@ """ +SYSTEM_PROMPT_OCR_MAC = """ +You are operating a computer, using the same operating system as a human. + +From looking at the screen, the objective, and your previous actions, take the next best series of action. 
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
+
+1. click - Move mouse and click
+[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them.
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are some helpful combinations:
+
+# Opens Spotlight Search on Mac
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Search for someone on Linkedin when already on linkedin.com
+[
+    {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
+    {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }},
+    {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }},
+]
+
+A very important note: don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective} # take the next best action for this objective
+"""
+
+SYSTEM_PROMPT_OCR_WIN_LINUX = """
+You are operating a computer, using the same operating system as a human.
+
+From looking at the screen, the objective, and your previous actions, take the next best series of actions.
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
+
+1. click - Move mouse and click
+[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them.
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are some helpful combinations:
+
+# Opens Menu Search on Windows and Linux
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Search for someone on Linkedin when already on linkedin.com
+[
+    {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
+    {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }},
+    {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }},
+]
+
+A few important notes:
+
+- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective} # take the next best action for this objective
+"""
+
 OPERATE_FIRST_MESSAGE_PROMPT = """
 Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done
 
@@ -208,26 +299,30 @@
 Action:"""
 
 
-def get_system_prompt(objective):
+def get_system_prompt(model, objective):
     """
     Format the vision prompt
     """
-    if platform.system() == "Darwin":
-        prompt = SYSTEM_PROMPT_MAC.format(objective=objective)
+    if VERBOSE:
+        print("[get_system_prompt] model", model)
+    if model == "gpt-4-with-som":
+        if platform.system() == "Darwin":
+            prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
+        else:
+            prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
+    elif model == "gpt-4-with-ocr":
+        if platform.system() == "Darwin":
+            prompt = SYSTEM_PROMPT_OCR_MAC.format(objective=objective)
+        else:
+            prompt = SYSTEM_PROMPT_OCR_WIN_LINUX.format(objective=objective)
     else:
-        prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective)
-
-    return prompt
+        if platform.system() == "Darwin":
+            prompt = SYSTEM_PROMPT_MAC.format(objective=objective)
+        else:
+            prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective)
 
-
-def get_system_prompt_labeled(objective):
-    """
-    Format the vision prompt
-    """
-    if platform.system() == "Darwin":
-        prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
-    else:
-        prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
+    if VERBOSE:
+        print("[get_system_prompt] prompt", prompt)
 
     return prompt
diff --git a/operate/operate.py b/operate/operate.py
index 29b9ca8e..d1a38e7b 100644
--- a/operate/operate.py
+++ b/operate/operate.py
@@ -96,10 +96,7 @@ def main(model, terminal_prompt, voice_mode=False):
         print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
         objective = prompt(style=style)
 
-    if model == "gpt-4-with-som":
-        system_prompt = get_system_prompt_labeled(objective)
-    else:
-        system_prompt = get_system_prompt(objective)
+    system_prompt = get_system_prompt(model, objective)
     system_message = {"role": "system", "content": system_prompt}
     messages = [system_message]
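For reference, the OCR pass that patch 01 wires into `call_gpt_4_vision_preview_ocr` reduces to a few EasyOCR calls. Here is a minimal, illustrative sketch (not part of any patch), assuming EasyOCR is installed and a screenshot has already been captured:

```python
import easyocr

# EasyOCR downloads its detection/recognition models on first use.
# Patch 01 loads ["ch_sim", "en"]; English-only keeps the download smaller.
reader = easyocr.Reader(["en"])

# readtext returns a list of (bounding_box, text, confidence) tuples
result = reader.readtext("screenshots/screenshot.png")

# Patch 01 joins the recognized strings into one block of context text
extracted_text = " ".join(item[1] for item in result)
print(extracted_text)
```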
From 8f7765c5beb4fd177713f7d60415f13f3dd92d51 Mon Sep 17 00:00:00
2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:03:05 -0800 Subject: [PATCH 03/22] Iterate ocr method with `get_text_element` --- operate/models/apis.py | 105 ++++++++++++++++++++++++++++++++--------- operate/utils/ocr.py | 33 +++++++++++++ 2 files changed, 117 insertions(+), 21 deletions(-) create mode 100644 operate/utils/ocr.py diff --git a/operate/models/apis.py b/operate/models/apis.py index 5a0b6d07..5650b6b3 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -20,6 +20,7 @@ get_user_prompt, get_system_prompt, ) +from operate.utils.ocr import get_text_element from operate.utils.label import ( @@ -62,7 +63,7 @@ async def get_next_action(model, messages, objective, session_id): def call_gpt_4_vision_preview(messages): if VERBOSE: - print("[Self Operating Computer][call_gpt_4_v]") + print("[call_gpt_4_v]") time.sleep(1) client = config.initialize_openai() try: @@ -84,7 +85,7 @@ def call_gpt_4_vision_preview(messages): if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_v] user_prompt", + "[call_gpt_4_v] user_prompt", user_prompt, ) @@ -119,7 +120,7 @@ def call_gpt_4_vision_preview(messages): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_v] content", + "[call_gpt_4_v] content", content, ) content = json.loads(content) @@ -161,25 +162,23 @@ def call_gemini_pro_vision(messages, objective): capture_screen_with_cursor(screenshot_filename) # sleep for a second time.sleep(1) - prompt = get_system_prompt(objective) + prompt = get_system_prompt("gemini-pro-vision", objective) model = config.initialize_google() if VERBOSE: - print("[Self Operating Computer][call_gemini_pro_vision] model", model) + print("[call_gemini_pro_vision] model", model) response = model.generate_content([prompt, Image.open(screenshot_filename)]) content = response.text[1:] if VERBOSE: - print( - "[Self Operating Computer][call_gemini_pro_vision] response", response - ) - print("[Self Operating Computer][call_gemini_pro_vision] content", content) + print("[call_gemini_pro_vision] response", response) + print("[call_gemini_pro_vision] content", content) content = json.loads(content) if VERBOSE: print( - "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content", + "[get_next_action][call_gemini_pro_vision] content", content, ) @@ -195,7 +194,7 @@ def call_gemini_pro_vision(messages, objective): async def call_gpt_4_vision_preview_ocr(messages, objective): if VERBOSE: - print("[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text") + print("[call_gpt_4_vision_preview_ocr] extracted_text") time.sleep(1) client = config.initialize_openai() @@ -212,20 +211,84 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - # Initialize EasyOCR Reader - reader = easyocr.Reader(["ch_sim", "en"]) + img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) - # Read the screenshot - result = reader.readtext(screenshot_filename) + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() - # Process the result - extracted_text = " ".join([item[1] for item in result]) if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_vision_preview_ocr] extracted_text", - extracted_text, + "[call_gpt_4_vision_preview_labeled] user_prompt", + user_prompt, ) + vision_message = { + "role": "user", + "content": [ + {"type": 
"text", "text": user_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_base64_labeled}" + }, + }, + ], + } + messages.append(vision_message) + + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=messages, + presence_penalty=1, + frequency_penalty=1, + temperature=0.7, + max_tokens=1000, + ) + + content = response.choices[0].message.content + + if content.startswith("```json"): + content = content[len("```json") :] # Remove starting ```json + if content.endswith("```"): + content = content[: -len("```")] # Remove ending + + content = json.loads(content) + if VERBOSE: + print("[call_gpt_4_vision_preview_ocr] content", content) + + processed_content = [] + + for operation in content: + if operation.get("operation") == "click": + text_to_click = operation.get("text") + if VERBOSE: + print( + "[call_gpt_4_vision_preview_ocr][click] text_to_click", + text_to_click, + ) + text_element_index = get_text_element(result, search_text) + if VERBOSE: + print( + "[call_gpt_4_vision_preview_ocr][click] text_element_index", + text_element_index, + ) + + else: + processed_content.append(operation) + + # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history + assistant_message = {"role": "assistant", "content": content} + if VERBOSE: + print( + "[call_gpt_4_vision_preview_labeled] content", + content, + ) + messages.append(assistant_message) + + return processed_content + except Exception as e: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}", @@ -262,7 +325,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_vision_preview_labeled] user_prompt", + "[call_gpt_4_vision_preview_labeled] user_prompt", user_prompt, ) @@ -299,7 +362,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): assistant_message = {"role": "assistant", "content": content} if VERBOSE: print( - "[Self Operating Computer][call_gpt_4_vision_preview_labeled] content", + "[call_gpt_4_vision_preview_labeled] content", content, ) messages.append(assistant_message) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py new file mode 100644 index 00000000..c47685d1 --- /dev/null +++ b/operate/utils/ocr.py @@ -0,0 +1,33 @@ +from operate.config import Config +from operate.config import Config + +# Load configuration +VERBOSE = Config().verbose + + +def get_text_element(result, search_text): + """ + Searches for a text element in the OCR results and returns its index. + Args: + result (list): The list of results returned by EasyOCR. + search_text (str): The text to search for in the OCR results. + + Returns: + int: The index of the element containing the search text. + + Raises: + Exception: If the text element is not found in the results. 
+ """ + if VERBOSE: + print("[get_text_element]") + print("[get_text_element] search_text", search_text) + for index, element in enumerate(result): + text = element[1] + if VERBOSE: + print("[get_text_element][loop] text", text) + if search_text in text: + if VERBOSE: + print("[get_text_element][loop] found search_text, index:", index) + + return index + raise Exception("The text element was not found in the image") From a156a48a5d33ed680aa98d74c9007b2e7b63e775 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:13:06 -0800 Subject: [PATCH 04/22] Bug fixes --- operate/models/apis.py | 16 +++++++++------- operate/operate.py | 1 - operate/utils/ocr.py | 1 - 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 5650b6b3..8decea15 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -211,8 +211,6 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) - if len(messages) == 1: user_prompt = get_user_first_message_prompt() else: @@ -220,7 +218,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): if VERBOSE: print( - "[call_gpt_4_vision_preview_labeled] user_prompt", + "[call_gpt_4_vision_preview_ocr] user_prompt", user_prompt, ) @@ -230,9 +228,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): {"type": "text", "text": user_prompt}, { "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{img_base64_labeled}" - }, + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, }, ], } @@ -268,7 +264,13 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): "[call_gpt_4_vision_preview_ocr][click] text_to_click", text_to_click, ) - text_element_index = get_text_element(result, search_text) + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + result = reader.readtext(screenshot_filename) + + text_element_index = get_text_element(result, text_to_click) if VERBOSE: print( "[call_gpt_4_vision_preview_ocr][click] text_element_index", diff --git a/operate/operate.py b/operate/operate.py index d1a38e7b..ab79d4f7 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -11,7 +11,6 @@ from operate.models.prompts import ( USER_QUESTION, get_system_prompt, - get_system_prompt_labeled, ) from operate.config import Config from operate.utils.style import ( diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index c47685d1..f918607a 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,5 +1,4 @@ from operate.config import Config -from operate.config import Config # Load configuration VERBOSE = Config().verbose From f6b61c63416eadf98a031c008965419cdf46f777 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:13:22 -0800 Subject: [PATCH 05/22] Experimental prompt fixes --- operate/models/prompts.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 57a0d3a5..e200bb37 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -50,7 +50,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 
-Objective: {objective} # take the next best action for this objective +Objective: {objective} """ SYSTEM_PROMPT_WIN_LINUX = """ @@ -95,7 +95,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ @@ -146,7 +146,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ SYSTEM_PROMPT_LABELED_WIN_LINUX = """ @@ -196,7 +196,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ @@ -239,7 +239,7 @@ A very important note, don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ SYSTEM_PROMPT_OCR_WIN_LINUX = """ @@ -284,7 +284,7 @@ - Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -Objective: {objective} # take the next best action for this objective +Objective: {objective} """ OPERATE_FIRST_MESSAGE_PROMPT = """ From c90ce6702993becb19a36bbb0883a0112b56f953 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 09:20:41 -0800 Subject: [PATCH 06/22] Add `get_text_coordinates` --- operate/models/apis.py | 8 +++++++- operate/utils/ocr.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 8decea15..9bd03252 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -20,7 +20,7 @@ get_user_prompt, get_system_prompt, ) -from operate.utils.ocr import get_text_element +from operate.utils.ocr import get_text_element, get_text_coordinates from operate.utils.label import ( @@ -271,11 +271,17 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): result = reader.readtext(screenshot_filename) text_element_index = get_text_element(result, text_to_click) + coordinates = get_text_coordinates(result, text_element_index) + if VERBOSE: print( "[call_gpt_4_vision_preview_ocr][click] text_element_index", text_element_index, ) + print( + "[call_gpt_4_vision_preview_ocr][click] coordinates", + coordinates, + ) else: processed_content.append(operation) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index f918607a..ce6cecd5 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -30,3 +30,31 @@ def get_text_element(result, search_text): return index raise Exception("The text element was not found in the image") + + +def get_text_coordinates(result, index): + """ + Gets the center coordinates of the text element at the specified index. 
+ Args: + result (list): The list of results returned by EasyOCR. + index (int): The index of the text element in the results list. + + Returns: + dict: A dictionary containing the 'x' and 'y' coordinates of the center of the text element. + """ + if index >= len(result): + raise Exception("Index out of range in OCR results") + + # Get the bounding box of the text element + bounding_box = result[index][0] + + # Calculate the center of the bounding box + min_x = min([coord[0] for coord in bounding_box]) + max_x = max([coord[0] for coord in bounding_box]) + min_y = min([coord[1] for coord in bounding_box]) + max_y = max([coord[1] for coord in bounding_box]) + + center_x = (min_x + max_x) / 2 + center_y = (min_y + max_y) / 2 + + return {"x": center_x, "y": center_y} From ff7a716dfa44569349a187fcac6841f34297c3fd Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:16:26 -0800 Subject: [PATCH 07/22] Change dir to `labeled_images_dir` --- operate/utils/label.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/operate/utils/label.py b/operate/utils/label.py index 14232391..20d55fee 100644 --- a/operate/utils/label.py +++ b/operate/utils/label.py @@ -68,11 +68,11 @@ def add_labels(base64_data, yolo_model): ) # Create a separate draw object for the debug image font_size = 45 - detections_dir = "detections" + labeled_images_dir = "labeled_images" label_coordinates = {} # Dictionary to store coordinates - if not os.path.exists(detections_dir): - os.makedirs(detections_dir) + if not os.path.exists(labeled_images_dir): + os.makedirs(labeled_images_dir) counter = 0 drawn_boxes = [] # List to keep track of boxes already drawn @@ -116,9 +116,11 @@ def add_labels(base64_data, yolo_model): # Save the image timestamp = time.strftime("%Y%m%d-%H%M%S") - output_path = os.path.join(detections_dir, f"img_{timestamp}_labeled.png") - output_path_debug = os.path.join(detections_dir, f"img_{timestamp}_debug.png") - output_path_original = os.path.join(detections_dir, f"img_{timestamp}_original.png") + output_path = os.path.join(labeled_images_dir, f"img_{timestamp}_labeled.png") + output_path_debug = os.path.join(labeled_images_dir, f"img_{timestamp}_debug.png") + output_path_original = os.path.join( + labeled_images_dir, f"img_{timestamp}_original.png" + ) image_labeled.save(output_path) image_debug.save(output_path_debug) From 5f5ed69f33d451a5928f2c419599ffcaf3564625 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:16:52 -0800 Subject: [PATCH 08/22] Iterate `get_text_coordinates` --- operate/models/apis.py | 13 ++++++++++++- operate/models/prompts.py | 4 ++-- operate/utils/ocr.py | 18 ++++++++++++++---- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 9bd03252..f8e440e5 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -271,7 +271,13 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): result = reader.readtext(screenshot_filename) text_element_index = get_text_element(result, text_to_click) - coordinates = get_text_coordinates(result, text_element_index) + coordinates = get_text_coordinates( + result, text_element_index, screenshot_filename + ) + + # add `coordinates`` to `content` + operation["x"] = coordinates["x"] + operation["y"] = coordinates["y"] if VERBOSE: print( @@ -282,6 +288,11 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): "[call_gpt_4_vision_preview_ocr][click] coordinates", coordinates, ) + print( + 
"[call_gpt_4_vision_preview_ocr][click] final operation", + operation, + ) + processed_content.append(operation) else: processed_content.append(operation) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index e200bb37..c8338eab 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -321,8 +321,8 @@ def get_system_prompt(model, objective): else: prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective) - if VERBOSE: - print("[get_system_prompt] prompt", prompt) + # if VERBOSE: + # print("[get_system_prompt] prompt", prompt) return prompt diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index ce6cecd5..41354eee 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,4 +1,5 @@ from operate.config import Config +from PIL import Image # Load configuration VERBOSE = Config().verbose @@ -32,15 +33,16 @@ def get_text_element(result, search_text): raise Exception("The text element was not found in the image") -def get_text_coordinates(result, index): +def get_text_coordinates(result, index, image_path): """ - Gets the center coordinates of the text element at the specified index. + Gets the coordinates of the text element at the specified index as a percentage of screen width and height. Args: result (list): The list of results returned by EasyOCR. index (int): The index of the text element in the results list. + image_path (str): Path to the screenshot image. Returns: - dict: A dictionary containing the 'x' and 'y' coordinates of the center of the text element. + dict: A dictionary containing the 'x' and 'y' coordinates as percentages of the screen width and height. """ if index >= len(result): raise Exception("Index out of range in OCR results") @@ -57,4 +59,12 @@ def get_text_coordinates(result, index): center_x = (min_x + max_x) / 2 center_y = (min_y + max_y) / 2 - return {"x": center_x, "y": center_y} + # Get image dimensions + with Image.open(image_path) as img: + width, height = img.size + + # Convert to percentages + percent_x = round((center_x / width), 1) + percent_y = round((center_y / height), 1) + + return {"x": percent_x, "y": percent_y} From 4790a4fa46d46a66e2b4bc4135d20e6f955f10cf Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:30:09 -0800 Subject: [PATCH 09/22] Fix `get_text_coordinates` --- operate/models/apis.py | 4 +++- operate/utils/ocr.py | 41 +++++++++++++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index f8e440e5..08e9e96a 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -270,7 +270,9 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): # Read the screenshot result = reader.readtext(screenshot_filename) - text_element_index = get_text_element(result, text_to_click) + text_element_index = get_text_element( + result, text_to_click, screenshot_filename + ) coordinates = get_text_coordinates( result, text_element_index, screenshot_filename ) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index 41354eee..90b66eda 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,16 +1,18 @@ from operate.config import Config -from PIL import Image +from PIL import Image, ImageDraw +import os # Load configuration VERBOSE = Config().verbose -def get_text_element(result, search_text): +def get_text_element(result, search_text, image_path): """ - Searches for a text element in the OCR results and returns its index. 
+ Searches for a text element in the OCR results and returns its index. Also draws bounding boxes on the image. Args: result (list): The list of results returned by EasyOCR. search_text (str): The text to search for in the OCR results. + image_path (str): Path to the original image. Returns: int: The index of the element containing the search text. @@ -21,15 +23,42 @@ def get_text_element(result, search_text): if VERBOSE: print("[get_text_element]") print("[get_text_element] search_text", search_text) + # Create /ocr directory if it doesn't exist + ocr_dir = "ocr" + if not os.path.exists(ocr_dir): + os.makedirs(ocr_dir) + + # Open the original image + image = Image.open(image_path) + draw = ImageDraw.Draw(image) + + found_index = None for index, element in enumerate(result): text = element[1] + box = element[0] + if VERBOSE: + # Draw bounding box in blue + draw.polygon([tuple(point) for point in box], outline="blue") print("[get_text_element][loop] text", text) + if search_text in text: + found_index = index if VERBOSE: print("[get_text_element][loop] found search_text, index:", index) - return index + if found_index is not None: + if VERBOSE: + # Draw bounding box of the found text in red + box = result[found_index][0] + draw.polygon([tuple(point) for point in box], outline="red") + # Save the image with bounding boxes + ocr_image_path = os.path.join(ocr_dir, "ocr_image.png") + image.save(ocr_image_path) + print("[get_text_element] OCR image saved at:", ocr_image_path) + + return found_index + raise Exception("The text element was not found in the image") @@ -64,7 +93,7 @@ def get_text_coordinates(result, index, image_path): width, height = img.size # Convert to percentages - percent_x = round((center_x / width), 1) - percent_y = round((center_y / height), 1) + percent_x = round((center_x / width), 3) + percent_y = round((center_y / height), 3) return {"x": percent_x, "y": percent_y} From 3281fac9fba1aa3a0cf3d3720894e01d1c2fc431 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 12:58:47 -0800 Subject: [PATCH 10/22] iterate get text function --- operate/config.py | 2 +- operate/utils/ocr.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/operate/config.py b/operate/config.py index d68382da..53df0361 100644 --- a/operate/config.py +++ b/operate/config.py @@ -26,7 +26,7 @@ def __new__(cls): def __init__(self): load_dotenv() - self.verbose = False + self.verbose = True self.openai_api_key = ( None # instance variables are backups in case saving to a `.env` fails ) diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index 90b66eda..c6ecc599 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -1,6 +1,7 @@ from operate.config import Config from PIL import Image, ImageDraw import os +from datetime import datetime # Load configuration VERBOSE = Config().verbose @@ -53,7 +54,8 @@ def get_text_element(result, search_text, image_path): box = result[found_index][0] draw.polygon([tuple(point) for point in box], outline="red") # Save the image with bounding boxes - ocr_image_path = os.path.join(ocr_dir, "ocr_image.png") + datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") + ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png") image.save(ocr_image_path) print("[get_text_element] OCR image saved at:", ocr_image_path) From df1cbd245525a53a2aab3fa72ea1a45ffc727d81 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 21 Jan 2024 13:12:49 -0800 Subject: [PATCH 11/22] add `content_str` for proper message structure --- 
 operate/models/apis.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 08e9e96a..2a25196d 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -250,6 +250,8 @@ async def call_gpt_4_vision_preview_ocr(messages, objective):
         if content.endswith("```"):
             content = content[: -len("```")]  # Remove ending
 
+        content_str = content
+
         content = json.loads(content)
         if VERBOSE:
             print("[call_gpt_4_vision_preview_ocr] content", content)
@@ -300,12 +302,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective):
                 processed_content.append(operation)
 
         # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history
-        assistant_message = {"role": "assistant", "content": content}
-        if VERBOSE:
-            print(
-                "[call_gpt_4_vision_preview_labeled] content",
-                content,
-            )
+        assistant_message = {"role": "assistant", "content": content_str}
         messages.append(assistant_message)
 
         return processed_content

From 14f3a8a255ff8e3e1b742a8ff7bd2605b1e8ed6f Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 21 Jan 2024 13:19:51 -0800
Subject: [PATCH 12/22] Adjust `SYSTEM_PROMPT_OCR_MAC`

---
 operate/config.py         |  3 ---
 operate/models/prompts.py | 12 ++++++++++--
 operate/utils/ocr.py      |  1 -
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index 53df0361..6d69158d 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -49,9 +49,6 @@ def initialize_openai(self):
             )
             api_key = os.getenv("OPENAI_API_KEY")
 
-        if self.verbose:
-            print("[Config][initialize_openai] api_key", api_key)
-
         client = OpenAI(
             api_key=api_key,
         )
diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index c8338eab..af388809 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -221,15 +221,23 @@
 
 Return the actions in array format `[]`. You can take just one action or multiple actions.
 
-Here are some helpful combinations:
+Here is a helpful example:
 
-# Opens Spotlight Search on Mac
+# Opens Spotlight Search on Mac and see if Google Chrome is available to use
 [
     {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
     {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
     {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
 ]
 
+# Go to a website (LinkedIn) when the browser is already open
+
+[
+    {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["command", "l"] }},
+    {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }},
+    {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }}
+]
+
 # Search for someone on Linkedin when already on linkedin.com
 [
     {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py
index c6ecc599..68b4d435 100644
--- a/operate/utils/ocr.py
+++ b/operate/utils/ocr.py
@@ -41,7 +41,6 @@ def get_text_element(result, search_text, image_path):
         if VERBOSE:
             # Draw bounding box in blue
             draw.polygon([tuple(point) for point in box], outline="blue")
-            print("[get_text_element][loop] text", text)
 
         if search_text in text:
             found_index = index

From 2716b9217242b92b48c4afe1863b11d1cb5791a5 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 21 Jan 2024 13:24:38 -0800
Subject: [PATCH 13/22] Add `easyocr` to `requirements.txt`

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 2c796cd9..f2727e69 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -49,4 +49,5 @@ wcwidth==0.2.9
 zipp==3.17.0
 google-generativeai==0.3.0
 aiohttp==3.9.1
-ultralytics==8.0.227
\ No newline at end of file
+ultralytics==8.0.227
+easyocr==1.7.1
\ No newline at end of file
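Before moving on to the prompt updates, it helps to see how the pieces built up across patches 03 through 09 fit together. The following is a condensed, illustrative sketch of what `get_text_element` and `get_text_coordinates` jointly do for a `click` operation; the function name here is ours, and error handling is simplified (the real `get_text_element` raises an `Exception` when no match is found):

```python
from PIL import Image


def text_to_percent_coordinates(result, search_text, image_path):
    # Find the first OCR element whose recognized text contains search_text;
    # `result` is EasyOCR's list of (bounding_box, text, confidence) tuples
    index = next(i for i, element in enumerate(result) if search_text in element[1])

    # Take the bounding box extremes to locate the element's center in pixels
    box = result[index][0]
    xs = [point[0] for point in box]
    ys = [point[1] for point in box]
    center_x = (min(xs) + max(xs)) / 2
    center_y = (min(ys) + max(ys)) / 2

    # Express the center as a fraction of the screenshot size, which is the
    # format the click operation carries in its "x" and "y" fields
    with Image.open(image_path) as img:
        width, height = img.size
    return {"x": round(center_x / width, 3), "y": round(center_y / height, 3)}
```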
From 817d137eedc50838df3ee6ca55916b3636f310d4 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 21 Jan 2024 13:24:50 -0800
Subject: [PATCH 14/22] Update `SYSTEM_PROMPT_OCR_WIN_LINUX`

---
 operate/models/prompts.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index af388809..14b89ee9 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -273,13 +273,21 @@
 
 Here are some helpful combinations:
 
-# Opens Menu Search on Windows and Linux
+# Opens Menu Search on Windows and Linux and see if Google Chrome is available to use
 [
     {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
     {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
     {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
 ]
 
+# Go to a website (LinkedIn) when the browser is already open
+
+[
+    {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["ctrl", "l"] }},
+    {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }},
+    {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }}
+]
+
 # Search for someone on Linkedin when already on linkedin.com
 [
     {{ "thought": "I can see the search field with the placeholder text 'search'. 
I click that field to search", "operation": "click", "text": "search" }}, From ff35f17a9b80624d903527a251852f0a69a1242c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 22 Jan 2024 07:33:33 -0800 Subject: [PATCH 15/22] Create `gpt_4_fallback` method --- operate/models/apis.py | 52 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 2a25196d..64499774 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -51,7 +51,7 @@ async def get_next_action(model, messages, objective, session_id): operation = await call_gpt_4_vision_preview_labeled(messages, objective) return operation, None if model == "gpt-4-with-ocr": - operation = await call_gpt_4_vision_preview_ocr(messages, objective) + operation = await call_gpt_4_vision_preview_ocr(messages, objective, model) return operation, None elif model == "agent-1": return "coming soon" @@ -192,14 +192,16 @@ def call_gemini_pro_vision(messages, objective): return call_gpt_4_vision_preview(messages) -async def call_gpt_4_vision_preview_ocr(messages, objective): +async def call_gpt_4_vision_preview_ocr(messages, objective, model): if VERBOSE: - print("[call_gpt_4_vision_preview_ocr] extracted_text") - time.sleep(1) - client = config.initialize_openai() + print("[call_gpt_4_vision_preview_ocr]") # Construct the path to the file within the package try: + time.sleep(1) + client = config.initialize_openai() + + confirm_system_prompt(messages, objective, model) screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): os.makedirs(screenshots_dir) @@ -312,7 +314,8 @@ async def call_gpt_4_vision_preview_ocr(messages, objective): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}", e, ) - return call_gpt_4_vision_preview(messages) + traceback.print_exc() + return gpt_4_fallback(messages, objective, model) async def call_gpt_4_vision_preview_labeled(messages, objective): @@ -447,6 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}", e, ) + traceback.print_exc() return call_gpt_4_vision_preview(messages) @@ -462,3 +466,39 @@ def get_last_assistant_message(messages): else: return messages[index] return None # Return None if no assistant message is found + + +def gpt_4_fallback(messages, objective, model): + if VERBOSE: + print("[gpt_4_fallback]") + system_prompt = get_system_prompt("gpt-4-vision-preview", objective) + new_system_message = {"role": "system", "content": system_prompt} + # remove and replace the first message in `messages` with `new_system_message` + + messages[0] = new_system_message + if VERBOSE: + print("[gpt_4_fallback] new messages", messages) + + if VERBOSE: + print("[gpt_4_fallback][updated]") + print("[gpt_4_fallback][updated] len(messages)", len(messages)) + + return call_gpt_4_vision_preview(messages) + + +def confirm_system_prompt(messages, objective, model): + """ + On `Exception` we default to `call_gpt_4_vision_preview` so we have this function to reassign system prompt in case of a previous failure + """ + if VERBOSE: + print("[confirm_system_prompt]") + + system_prompt = get_system_prompt(model, objective) + new_system_message = {"role": "system", "content": system_prompt} + # remove and replace the first message in `messages` with `new_system_message` + + messages[0] = new_system_message + + if VERBOSE: + print("[confirm_system_prompt][updated]") + print("[confirm_system_prompt][updated] len(messages)", len(messages)) From 135b3ea6e4a79d85946bdf23c178c3a96cea6075 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 22 Jan 2024 07:33:53 -0800 Subject: [PATCH 16/22] New click prompt 'no button' approach for `SYSTEM_PROMPT_OCR_MAC` --- operate/models/prompts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 14b89ee9..2bac4cb0 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -208,7 +208,7 @@ You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click -[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them. +[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons or links with text to click. If the button you want to click doesn't have text you can say `"no button"` for the text value and we'll try a different method. 2. write - Write with your keyboard [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] @@ -233,7 +233,7 @@ # Go to a website (LinkedIn) when the browser is already open [ - {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["command", "l"] }}, + {{ "thought": "I can see that Google Chrome is open. 
I'll focus on the address bar to type ", "operation": "press", "keys": ["command", "t"] }}, {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }}, {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }} ] @@ -258,7 +258,7 @@ You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click -[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons and links with the text. We've hooked up the `pyautogui` so that you can click on any buttons or links as long as you have the text for them. +[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] # Look for buttons or links with text to click. If the button you want to click doesn't have text you can say `"no button"` for the text value and we'll try a different method. 2. write - Write with your keyboard [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] @@ -283,7 +283,7 @@ # Go to a website (LinkedIn) when the browser is already open [ - {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["ctrl", "l"] }}, + {{ "thought": "I can see that Google Chrome is open. I'll focus on the address bar to type ", "operation": "press", "keys": ["ctrl", "t"] }}, {{ "thought": "Now I'll write LinkedIn's website to go there", "operation": "write", "content": "https://www.linkedin.com/feed/" }}, {{ "thought": "Finally I'll press enter to go to LinkedIn", "operation": "press", "keys": ["enter"] }} ] From 849cc7efba40b16dc4d9ab6a32b225d4a57f8a20 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Tue, 23 Jan 2024 19:12:39 -0800 Subject: [PATCH 17/22] update `get_system_prompt` --- operate/models/prompts.py | 55 +++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 2bac4cb0..5ba4ec67 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -317,28 +317,43 @@ def get_system_prompt(model, objective): """ - Format the vision prompt + Format the vision prompt more efficiently and print the name of the prompt used """ + + prompt_map = { + ("gpt-4-with-som", "Darwin"): ( + SYSTEM_PROMPT_LABELED_MAC, + "SYSTEM_PROMPT_LABELED_MAC", + ), + ("gpt-4-with-som", "Other"): ( + SYSTEM_PROMPT_LABELED_WIN_LINUX, + "SYSTEM_PROMPT_LABELED_WIN_LINUX", + ), + ("gpt-4-with-ocr", "Darwin"): (SYSTEM_PROMPT_OCR_MAC, "SYSTEM_PROMPT_OCR_MAC"), + ("gpt-4-with-ocr", "Other"): ( + SYSTEM_PROMPT_OCR_WIN_LINUX, + "SYSTEM_PROMPT_OCR_WIN_LINUX", + ), + ("default", "Darwin"): (SYSTEM_PROMPT_MAC, "SYSTEM_PROMPT_MAC"), + ("default", "Other"): (SYSTEM_PROMPT_WIN_LINUX, "SYSTEM_PROMPT_WIN_LINUX"), + } + + os_type = "Darwin" if platform.system() == "Darwin" else "Other" + + # Fetching the prompt tuple (string and name) based on the model and OS + prompt_tuple = prompt_map.get((model, os_type), prompt_map[("default", os_type)]) + + # Extracting the prompt string and its name + prompt_string, prompt_name = prompt_tuple + + # Formatting the prompt + prompt = prompt_string.format(objective=objective) + + # Optional verbose output if VERBOSE: - print("[get_system_prompt] 
model", model) - if model == "gpt-4-with-som": - if platform.system() == "Darwin": - prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective) - else: - prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective) - elif model == "gpt-4-with-ocr": - if platform.system() == "Darwin": - prompt = SYSTEM_PROMPT_OCR_MAC.format(objective=objective) - else: - prompt = SYSTEM_PROMPT_OCR_WIN_LINUX.format(objective=objective) - else: - if platform.system() == "Darwin": - prompt = SYSTEM_PROMPT_MAC.format(objective=objective) - else: - prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective) - - # if VERBOSE: - # print("[get_system_prompt] prompt", prompt) + print("[get_system_prompt] model:", model) + print("[get_system_prompt] prompt name:", prompt_name) + # print("[get_system_prompt] prompt:", prompt) return prompt From e1fa95374d2b4322be06722f0131ac1a551b0d23 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:24:30 -0800 Subject: [PATCH 18/22] Turn off `verbose` --- operate/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index 6d69158d..0254862f 100644 --- a/operate/config.py +++ b/operate/config.py @@ -26,7 +26,7 @@ def __new__(cls): def __init__(self): load_dotenv() - self.verbose = True + self.verbose = False self.openai_api_key = ( None # instance variables are backups in case saving to a `.env` fails ) From 39ddc45dd6879b93494d47068b7bf149162b114c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:27:04 -0800 Subject: [PATCH 19/22] Add missing `if self.verbose:` --- operate/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/operate/config.py b/operate/config.py index 0254862f..581af043 100644 --- a/operate/config.py +++ b/operate/config.py @@ -62,9 +62,10 @@ def initialize_google(self): print("[Config][initialize_google] using cached google_api_key") api_key = self.google_api_key else: - print( - "[Config][initialize_google] no cached google_api_key, try to get from env." - ) + if self.verbose: + print( + "[Config][initialize_google] no cached google_api_key, try to get from env." + ) api_key = os.getenv("GOOGLE_API_KEY") genai.configure(api_key=api_key, transport="rest") model = genai.GenerativeModel("gemini-pro-vision") From e8ffbb5edbfecb722df8b7d6734229cfebbbdce6 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:29:12 -0800 Subject: [PATCH 20/22] Default to `gpt-4-with-ocr` since it performs best --- operate/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/main.py b/operate/main.py index 3cf991da..73065854 100644 --- a/operate/main.py +++ b/operate/main.py @@ -15,7 +15,7 @@ def main_entry(): "--model", help="Specify the model to use", required=False, - default="gpt-4", + default="gpt-4-with-ocr", ) # Add a voice flag From 24e688432bdd2e8f8a9f90e7e2792499ce6ae0b7 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Wed, 24 Jan 2024 08:40:11 -0800 Subject: [PATCH 21/22] Add `### Optical Character Recognition Mode `-m gpt-4-with-ocr`` --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 120b83f9..af201d62 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,11 @@ operate -m gemini-pro-vision **Enter your Google AI Studio API key when terminal prompts you for it** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. 
You may also need to [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working; if anyone knows a simpler way, please make a PR:

+### Optical Character Recognition Mode `-m gpt-4-with-ocr`
+The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable text elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then references the hash map to get the coordinates of that element.
+
+Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use the OCR mode you can simply write `operate` or `operate -m gpt-4-with-ocr` will also work.
+
 ### Set-of-Mark Prompting `-m gpt-4-with-som`
 The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.

From b1cc4faab51cec108a7ddfbb748bf138441f22cc Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Wed, 24 Jan 2024 08:41:53 -0800
Subject: [PATCH 22/22] Update `readme.md`

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index af201d62..f52399bf 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,9 @@ operate -m gemini-pro-vision
 ### Optical Character Recognition Mode `-m gpt-4-with-ocr`
 The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable text elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then references the hash map to get the coordinates of that element.
 
-Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use the OCR mode you can simply write `operate` or `operate -m gpt-4-with-ocr` will also work.
+Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use the OCR mode you can simply run:
+
+`operate` (or `operate -m gpt-4-with-ocr`, which also works).
 
 ### Set-of-Mark Prompting `-m gpt-4-with-som`
 The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.
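Putting the series together, a `click` operation in the new OCR mode travels from model output to a real mouse click roughly as follows. This is a simplified sketch rather than the framework's exact executor: it assumes the percentage coordinates produced by `get_text_coordinates` are converted to pixels by multiplying against the screen size, and the operation dict and coordinate values shown are hypothetical:

```python
import pyautogui

# A click operation as the model might return it in gpt-4-with-ocr mode
operation = {"thought": "I need to search", "operation": "click", "text": "search"}

# Percent coordinates as attached by the OCR lookup (hypothetical values)
operation["x"], operation["y"] = 0.512, 0.043

# Convert the percentages to pixel coordinates and perform the click
screen_width, screen_height = pyautogui.size()
pyautogui.click(operation["x"] * screen_width, operation["y"] * screen_height)
```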