DEV: Extend truncation to all summarizable content (#884)
romanrizzi authored Oct 31, 2024
1 parent e8eed71 commit e8f0633
Showing 5 changed files with 32 additions and 25 deletions.
28 changes: 23 additions & 5 deletions lib/summarization/fold_content.rb
@@ -25,8 +25,10 @@ def initialize(llm, strategy, persist_summaries: true)
       def summarize(user, &on_partial_blk)
         base_summary = ""
         initial_pos = 0
-        folded_summary =
-          fold(content_to_summarize, base_summary, initial_pos, user, &on_partial_blk)
+
+        truncated_content = content_to_summarize.map { |cts| truncate(cts) }
+
+        folded_summary = fold(truncated_content, base_summary, initial_pos, user, &on_partial_blk)
 
         clean_summary =
           Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary
@@ -37,7 +39,7 @@ def summarize(user, &on_partial_blk)
             strategy.type,
             llm_model.name,
             clean_summary,
-            content_to_summarize.map { |c| c[:id] },
+            truncated_content.map { |c| c[:id] },
           )
         else
           AiSummary.new(summarized_text: clean_summary)
@@ -121,9 +123,9 @@ def fold(items, summary, cursor, user, &on_partial_blk)
         prompt =
           (
             if summary.blank?
-              strategy.first_summary_prompt(iteration_content, tokenizer)
+              strategy.first_summary_prompt(iteration_content)
             else
-              strategy.summary_extension_prompt(summary, iteration_content, tokenizer)
+              strategy.summary_extension_prompt(summary, iteration_content)
             end
           )
 
@@ -143,6 +145,22 @@ def available_tokens
 
         llm_model.max_prompt_tokens - reserved_tokens
       end
+
+      def truncate(item)
+        item_content = item[:text].to_s
+        split_1, split_2 =
+          [item_content[0, item_content.size / 2], item_content[(item_content.size / 2)..-1]]
+
+        truncation_length = 500
+        tokenizer = llm_model.tokenizer_class
+
+        item[:text] = [
+          tokenizer.truncate(split_1, truncation_length),
+          tokenizer.truncate(split_2.reverse, truncation_length).reverse,
+        ].join(" ")
+
+        item
+      end
     end
   end
 end
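
For context, the head-and-tail behaviour of the new FoldContent#truncate can be sketched in isolation. This is a minimal, standalone sketch: the WordTokenizer below is a hypothetical stand-in that splits on whitespace, whereas the plugin resolves the real tokenizer from llm_model.tokenizer_class and truncates by model tokens.

# Hypothetical tokenizer that "tokenizes" on whitespace; the plugin's tokenizer
# classes count model tokens instead.
class WordTokenizer
  def self.truncate(text, max_tokens)
    text.split(" ").first(max_tokens).join(" ")
  end
end

# Mirrors the shape of FoldContent#truncate: split the text in half, keep up to
# `limit` tokens from the start of the first half and from the end of the second.
def truncate_head_and_tail(text, tokenizer:, limit: 500)
  head, tail = text[0, text.size / 2], text[(text.size / 2)..-1]

  [
    tokenizer.truncate(head, limit),                 # keep the beginning
    tokenizer.truncate(tail.reverse, limit).reverse, # keep the end
  ].join(" ")
end

long_post = (1..4_000).map { |i| "word#{i}" }.join(" ")
truncated = truncate_head_and_tail(long_post, tokenizer: WordTokenizer, limit: 500)
# => roughly the first 500 and last 500 "tokens", with the middle dropped

Under these assumptions an oversized item keeps both its opening and its ending while the middle is discarded, so each item fits the prompt budget without losing the start of the content or its most recent part.
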
4 changes: 2 additions & 2 deletions lib/summarization/strategies/base.rb
@@ -34,12 +34,12 @@ def targets_data
       end
 
       # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary.
-      def summary_extension_prompt(_summary, _texts_to_summarize, _tokenizer)
+      def summary_extension_prompt(_summary, _texts_to_summarize)
         raise NotImplementedError
       end
 
       # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content.
-      def first_summary_prompt(_input, _tokenizer)
+      def first_summary_prompt(_input)
         raise NotImplementedError
       end
 
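
A minimal sketch of a strategy written against the updated Base: the prompt builders no longer receive a tokenizer, since FoldContent truncates every item before folding. The ExampleStrategy class and its prompt wording are hypothetical; only the method names, their two-argument arities, the item mapping, and the DiscourseAi::Completions::Prompt constructor come from this diff.

module DiscourseAi
  module Summarization
    module Strategies
      # Hypothetical subclass, shown only to illustrate the new two-argument API.
      class ExampleStrategy < Base
        def summary_extension_prompt(summary, contents)
          input = contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join

          DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            Extend this summary with the new statements below.

            Existing summary: #{summary}
            New statements: #{input}
          TEXT
        end

        def first_summary_prompt(contents)
          input = contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join

          DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            Summarize the following conversation in a concise paragraph:

            #{input}
          TEXT
        end
      end
    end
  end
end

The tokenizer parameter could be dropped from every strategy because truncation now happens once, centrally, in FoldContent#truncate rather than inside individual prompt builders.
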
4 changes: 2 additions & 2 deletions lib/summarization/strategies/chat_messages.rb
@@ -23,7 +23,7 @@ def targets_data
           .map { { id: _1, poster: _2, text: _3 } }
       end
 
-      def summary_extension_prompt(summary, contents, _tokenizer)
+      def summary_extension_prompt(summary, contents)
         input =
           contents
             .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
@@ -63,7 +63,7 @@ def summary_extension_prompt(summary, contents, _tokenizer)
         prompt
       end
 
-      def first_summary_prompt(contents, _tokenizer)
+      def first_summary_prompt(contents)
         content_title = target.name
         input =
           contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
17 changes: 3 additions & 14 deletions lib/summarization/strategies/hot_topic_gists.rb
@@ -57,7 +57,7 @@ def targets_data
         end
       end
 
-      def summary_extension_prompt(summary, contents, _tokenizer)
+      def summary_extension_prompt(summary, contents)
         statements =
           contents
             .to_a
@@ -98,22 +98,11 @@ def summary_extension_prompt(summary, contents, _tokenizer)
         prompt
       end
 
-      def first_summary_prompt(contents, tokenizer)
+      def first_summary_prompt(contents)
         content_title = target.title
         statements =
           contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
 
-        op_statement = statements.shift.to_s
-        split_1, split_2 =
-          [op_statement[0, op_statement.size / 2], op_statement[(op_statement.size / 2)..-1]]
-
-        truncation_length = 500
-
-        op_statement = [
-          tokenizer.truncate(split_1, truncation_length),
-          tokenizer.truncate(split_2.reverse, truncation_length).reverse,
-        ].join(" ")
-
         prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
           You are an advanced summarization bot. Analyze a given conversation and produce a concise,
           single-sentence summary that conveys the main topic and current developments to someone with no prior context.
@@ -138,7 +127,7 @@ def first_summary_prompt(contents, tokenizer)
           The conversation began with the following statement:
-          #{op_statement}\n
+          #{statements.shift}\n
         TEXT
 
         if statements.present?
4 changes: 2 additions & 2 deletions lib/summarization/strategies/topic_summary.rb
@@ -27,7 +27,7 @@ def targets_data
         end
       end
 
-      def summary_extension_prompt(summary, contents, _tokenizer)
+      def summary_extension_prompt(summary, contents)
         resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
         content_title = target.title
         input =
@@ -70,7 +70,7 @@ def summary_extension_prompt(summary, contents, _tokenizer)
         prompt
       end
 
-      def first_summary_prompt(contents, _tokenizer)
+      def first_summary_prompt(contents)
         resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
         content_title = target.title
         input =
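
Taken together, a hedged usage sketch of how a summarizer is driven after this change. How the llm and strategy objects are obtained below is an assumption for illustration; only FoldContent.new(llm, strategy, persist_summaries:), #summarize(user, &blk), and AiSummary#summarized_text appear in this diff.

# Assumed factory and constructor calls; check the plugin version for the exact API.
topic = Topic.last
llm = DiscourseAi::Completions::Llm.proxy("custom:#{LlmModel.last.id}")     # assumed factory
strategy = DiscourseAi::Summarization::Strategies::TopicSummary.new(topic)  # assumed constructor

summarizer = DiscourseAi::Summarization::FoldContent.new(llm, strategy, persist_summaries: false)
ai_summary = summarizer.summarize(Discourse.system_user) { |partial| print partial }

ai_summary.summarized_text
# Every item returned by the strategy now passes through FoldContent#truncate before folding,
# so oversized posts keep roughly their first and last 500 tokens.
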
