Merge pull request #145 from OthersideAI/add-ocr
Add ocr
joshbickett authored Jan 24, 2024
2 parents d0a972f + b1cc4fa commit fd0b8b3
Showing 9 changed files with 453 additions and 57 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -91,6 +91,13 @@ operate -m gemini-pro-vision

**Enter your Google AI Studio API key when the terminal prompts you for it.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need to [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get this working; if anyone knows a simpler way, please make a PR.
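
As a quick sanity check that the key is visible to the framework (as the `operate/config.py` diff below shows, it falls back to reading `GOOGLE_API_KEY` from the environment when no key has been cached):

```python
import os

# operate/config.py falls back to os.getenv("GOOGLE_API_KEY") when no
# key has been cached, so verify it is set before running `operate`.
assert os.getenv("GOOGLE_API_KEY"), "GOOGLE_API_KEY is not set"
```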

### Optical Character Recognition Mode `-m gpt-4-with-ocr`
The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then looks up that element's coordinates in the hash map.

Based on recent tests, OCR mode performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use OCR mode, simply run:

`operate` (or, equivalently, `operate -m gpt-4-with-ocr`).
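
For illustration, here is a minimal sketch of the text-to-coordinates idea (names are illustrative; the real helpers are `get_text_element` and `get_text_coordinates` in `operate/utils/ocr.py`):

```python
import easyocr

# Read all text on the screenshot. Each result is (bbox, text, confidence),
# where bbox is four (x, y) corner points in pixels.
reader = easyocr.Reader(["en"])
results = reader.readtext("screenshot.png")

# Build the "hash map" from recognized text to its bounding box.
elements_by_text = {text: bbox for bbox, text, _confidence in results}

# If GPT-4 decides to click "Submit", look up that element's center.
bbox = elements_by_text.get("Submit")
if bbox is not None:
    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]
    click_x, click_y = sum(xs) / len(xs), sum(ys) / len(ys)
```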

### Set-of-Mark Prompting `-m gpt-4-with-som`
The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models.

10 changes: 4 additions & 6 deletions operate/config.py
@@ -49,9 +49,6 @@ def initialize_openai(self):
)
api_key = os.getenv("OPENAI_API_KEY")

if self.verbose:
print("[Config][initialize_openai] api_key", api_key)

client = OpenAI(
api_key=api_key,
)
@@ -65,9 +62,10 @@ def initialize_google(self):
print("[Config][initialize_google] using cached google_api_key")
api_key = self.google_api_key
else:
print(
"[Config][initialize_google] no cached google_api_key, try to get from env."
)
if self.verbose:
print(
"[Config][initialize_google] no cached google_api_key, try to get from env."
)
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key, transport="rest")
model = genai.GenerativeModel("gemini-pro-vision")
2 changes: 1 addition & 1 deletion operate/main.py
@@ -15,7 +15,7 @@ def main_entry():
"--model",
help="Specify the model to use",
required=False,
default="gpt-4",
default="gpt-4-with-ocr",
)

# Add a voice flag
200 changes: 183 additions & 17 deletions operate/models/apis.py
@@ -4,6 +4,7 @@
import base64
import traceback
import io
import easyocr


from PIL import Image
@@ -19,6 +20,7 @@
get_user_prompt,
get_system_prompt,
)
from operate.utils.ocr import get_text_element, get_text_coordinates


from operate.utils.label import (
@@ -48,6 +50,9 @@ async def get_next_action(model, messages, objective, session_id):
if model == "gpt-4-with-som":
operation = await call_gpt_4_vision_preview_labeled(messages, objective)
return operation, None
if model == "gpt-4-with-ocr":
operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
return operation, None
elif model == "agent-1":
return "coming soon"
elif model == "gemini-pro-vision":
@@ -58,7 +63,7 @@ async def get_next_action(model, messages, objective, session_id):

def call_gpt_4_vision_preview(messages):
if VERBOSE:
print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
print("[call_gpt_4_v]")
time.sleep(1)
client = config.initialize_openai()
try:
Expand All @@ -80,7 +85,7 @@ def call_gpt_4_vision_preview(messages):

if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt",
"[call_gpt_4_v] user_prompt",
user_prompt,
)

@@ -115,7 +120,7 @@ def call_gpt_4_vision_preview(messages):
assistant_message = {"role": "assistant", "content": content}
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_v] content",
"[call_gpt_4_v] content",
content,
)
content = json.loads(content)
@@ -157,25 +162,23 @@ def call_gemini_pro_vision(messages, objective):
capture_screen_with_cursor(screenshot_filename)
# sleep for a second
time.sleep(1)
prompt = get_system_prompt(objective)
prompt = get_system_prompt("gemini-pro-vision", objective)

model = config.initialize_google()
if VERBOSE:
print("[Self Operating Computer][call_gemini_pro_vision] model", model)
print("[call_gemini_pro_vision] model", model)

response = model.generate_content([prompt, Image.open(screenshot_filename)])

content = response.text[1:]
if VERBOSE:
print(
"[Self Operating Computer][call_gemini_pro_vision] response", response
)
print("[Self Operating Computer][call_gemini_pro_vision] content", content)
print("[call_gemini_pro_vision] response", response)
print("[call_gemini_pro_vision] content", content)

content = json.loads(content)
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gemini_pro_vision] content",
"[get_next_action][call_gemini_pro_vision] content",
content,
)

@@ -189,6 +192,132 @@ def call_gemini_pro_vision(messages, objective):
return call_gpt_4_vision_preview(messages)


async def call_gpt_4_vision_preview_ocr(messages, objective, model):
if VERBOSE:
print("[call_gpt_4_vision_preview_ocr]")

# Capture the screen and query GPT-4 for the next operation
try:
time.sleep(1)
client = config.initialize_openai()

confirm_system_prompt(messages, objective, model)
screenshots_dir = "screenshots"
if not os.path.exists(screenshots_dir):
os.makedirs(screenshots_dir)

screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
# Call the function to capture the screen with the cursor
capture_screen_with_cursor(screenshot_filename)

with open(screenshot_filename, "rb") as img_file:
img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

if len(messages) == 1:
user_prompt = get_user_first_message_prompt()
else:
user_prompt = get_user_prompt()

if VERBOSE:
print(
"[call_gpt_4_vision_preview_ocr] user_prompt",
user_prompt,
)

vision_message = {
"role": "user",
"content": [
{"type": "text", "text": user_prompt},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
},
],
}
messages.append(vision_message)

response = client.chat.completions.create(
model="gpt-4-vision-preview",
messages=messages,
presence_penalty=1,
frequency_penalty=1,
temperature=0.7,
max_tokens=1000,
)

content = response.choices[0].message.content

if content.startswith("```json"):
content = content[len("```json") :] # Remove starting ```json
if content.endswith("```"):
content = content[: -len("```")] # Remove ending

content_str = content

content = json.loads(content)
if VERBOSE:
print("[call_gpt_4_vision_preview_ocr] content", content)

processed_content = []

for operation in content:
if operation.get("operation") == "click":
text_to_click = operation.get("text")
if VERBOSE:
print(
"[call_gpt_4_vision_preview_ocr][click] text_to_click",
text_to_click,
)
# Initialize EasyOCR Reader
reader = easyocr.Reader(["en"])

# Read the screenshot
result = reader.readtext(screenshot_filename)

text_element_index = get_text_element(
result, text_to_click, screenshot_filename
)
coordinates = get_text_coordinates(
result, text_element_index, screenshot_filename
)

# add `coordinates` to `content`
operation["x"] = coordinates["x"]
operation["y"] = coordinates["y"]

if VERBOSE:
print(
"[call_gpt_4_vision_preview_ocr][click] text_element_index",
text_element_index,
)
print(
"[call_gpt_4_vision_preview_ocr][click] coordinates",
coordinates,
)
print(
"[call_gpt_4_vision_preview_ocr][click] final operation",
operation,
)
processed_content.append(operation)

else:
processed_content.append(operation)

# wait to append the assistant message so that, if the `processed_content` step fails, we don't append a message and corrupt the message history
assistant_message = {"role": "assistant", "content": content_str}
messages.append(assistant_message)

return processed_content

except Exception as e:
print(
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
e,
)
traceback.print_exc()
return gpt_4_fallback(messages, objective, model)


async def call_gpt_4_vision_preview_labeled(messages, objective):
time.sleep(1)
client = config.initialize_openai()
@@ -217,7 +346,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt",
"[call_gpt_4_vision_preview_labeled] user_prompt",
user_prompt,
)

@@ -254,7 +383,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
assistant_message = {"role": "assistant", "content": content}
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content",
"[call_gpt_4_vision_preview_labeled] content",
content,
)
messages.append(assistant_message)
@@ -268,14 +397,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
label = operation.get("label")
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] label",
label,
)

coordinates = get_label_coordinates(label, label_coordinates)
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates",
coordinates,
)
image = Image.open(
@@ -287,7 +416,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
)
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent",
click_position_percent,
)
if not click_position_percent:
Expand All @@ -302,7 +431,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
operation["y"] = y_percent
if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation",
operation,
)
processed_content.append(operation)
@@ -311,7 +440,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):

if VERBOSE:
print(
"[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content",
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content",
processed_content,
)
return processed_content
@@ -321,6 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}",
e,
)
traceback.print_exc()
return call_gpt_4_vision_preview(messages)
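
Aside: `get_label_coordinates` and `get_click_position_in_percent` are imported from `operate.utils.label` and are not shown in this diff. A minimal sketch of plausible implementations, assuming `label_coordinates` maps SoM labels to pixel bounding boxes with `"x"`, `"y"`, `"width"`, and `"height"` keys (the exact shape is an assumption):

```python
def get_label_coordinates(label, label_coordinates):
    # Look up the pixel bounding box recorded for a SoM label (e.g. "~34")
    # when the screenshot was annotated; None if the label is unknown.
    return label_coordinates.get(label)


def get_click_position_in_percent(coordinates, image_size):
    # Convert the center of a pixel bounding box into fractions of the
    # image size, which the downstream mouse layer consumes.
    if not coordinates:
        return None
    width, height = image_size
    x_percent = (coordinates["x"] + coordinates["width"] / 2) / width
    y_percent = (coordinates["y"] + coordinates["height"] / 2) / height
    return x_percent, y_percent
```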


@@ -336,3 +466,39 @@ def get_last_assistant_message(messages):
else:
return messages[index]
return None # Return None if no assistant message is found


def gpt_4_fallback(messages, objective, model):
if VERBOSE:
print("[gpt_4_fallback]")
system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
new_system_message = {"role": "system", "content": system_prompt}
# replace the first message in `messages` with `new_system_message`

messages[0] = new_system_message
if VERBOSE:
print("[gpt_4_fallback] new messages", messages)

if VERBOSE:
print("[gpt_4_fallback][updated]")
print("[gpt_4_fallback][updated] len(messages)", len(messages))

return call_gpt_4_vision_preview(messages)


def confirm_system_prompt(messages, objective, model):
"""
On `Exception` we fall back to `call_gpt_4_vision_preview`, so this function reassigns the system prompt in case of a previous failure.
"""
if VERBOSE:
print("[confirm_system_prompt]")

system_prompt = get_system_prompt(model, objective)
new_system_message = {"role": "system", "content": system_prompt}
# replace the first message in `messages` with `new_system_message`

messages[0] = new_system_message

if VERBOSE:
print("[confirm_system_prompt][updated]")
print("[confirm_system_prompt][updated] len(messages)", len(messages))