check diff output to make sure it looks reasonable

This commit is contained in:
burnettk 2024-06-27 17:25:54 -04:00
parent 0ca371149f
commit 03beaf8936
No known key found for this signature in database
3 changed files with 35 additions and 9 deletions

View File

@ -84,7 +84,7 @@ There are four types of Gateways: Exclusive, Parallel, Inclusive, and Event-Base
## Intermediate Event
This is an event that occurs within the middle of a process, neither at the start nor the end.
This is an event that occurs in the middle of a process, neither at the start nor the end.
It can be connected to other tasks through connectors or placed on the border of a task.
It evaluates conditions and circumstances, triggering events and enabling the initiation of alternative paths within the process.
@ -99,7 +99,7 @@ These are subdivisions within a Pool that are utilized to assign activities to s
## Merge
This is the process in which two or more parallel Sequence Flow paths converge into a single path, achieved either through multiple incoming Sequence Flows or by utilizing an Exclusive Gateway.
This merging of paths is also commonly referred to as an "OR-Join".
This merging of paths is also commonly referred to as an "OR-Join."
## Message

View File

@ -69,20 +69,22 @@ def read_file(file_path):
with open(file_path, "r") as f:
return f.read()
def split_content(content, chunk_size=13000):
    """Break *content* into a list of text chunks.

    Args:
        content: The full text to split.
        chunk_size: Chunk size handed to ``CharacterTextSplitter``.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` yields for *content*,
        using the given chunk size and no overlap between chunks.
    """
    # Zero overlap so that no text is repeated across adjacent chunks.
    return CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
    ).split_text(content)
def process_chunk(doc, chat_prompt, retries=3, chunk_index=0):
    """Send one chunk of text to the LLM and return the edited text.

    A reply is accepted only when its length is within 5% of the original
    chunk's length; anything outside that window is treated as a failed
    edit and the request is retried.

    Args:
        doc: The text chunk to edit.
        chat_prompt: Prompt template; it is formatted with ``text=doc`` and
            converted to messages before being passed to the module-level
            ``llm``.
        retries: Maximum number of LLM calls to make for this chunk.
        chunk_index: Identifier of the chunk, used only to label the retry
            and failure messages so a bad chunk can be located.

    Returns:
        The content of the first reply that passes the length check.

    Raises:
        ValueError: If no attempt produced a reply of acceptable length
            (including when ``retries`` is 0).
    """
    # The bounds depend only on the input, so compute them once.
    doc_length = len(doc)
    min_length = 0.95 * doc_length
    max_length = 1.05 * doc_length

    for attempt in range(retries):
        result = llm.invoke(chat_prompt.format_prompt(text=doc).to_messages())
        edited_result_content = result.content
        if min_length <= len(edited_result_content) <= max_length:
            return edited_result_content
        print(
            f"Retry {attempt + 1} for chunk {chunk_index} due to size mismatch."
        )

    raise ValueError(
        f"Failed to process chunk {chunk_index} after {retries} retries."
    )
def write_to_temp_file(temp_file_path, docs, chat_prompt):
os.makedirs("/tmp/proof-edits", exist_ok=True)
with open(temp_file_path, "w") as f:
@ -90,12 +92,27 @@ def write_to_temp_file(temp_file_path, docs, chat_prompt):
edited_result_content = process_chunk(doc, chat_prompt, chunk_index=i)
f.write(edited_result_content + "\n")
def analyze_diff(diff_file_path):
    """Ask the LLM whether a saved diff looks like a clean copy edit.

    Args:
        diff_file_path: Path of the diff file to load with ``read_file``.

    Returns:
        The content of the LLM's reply. The prompt asks the model to answer
        with the single word ``good`` when it finds no issues, and with
        feedback otherwise.
    """
    diff_text = read_file(diff_file_path)
    # The prompt body is kept flush-left so no indentation leaks into the
    # text sent to the model.
    request = HumanMessage(
        content=f"""
You are an expert technical editor. Please analyze the following diff and ensure it looks like a successful copy edit of a markdown file. Provide feedback if there are any issues or if it looks good, just reply with the single word: good
Diff:
{diff_text}
"""
    )
    reply = llm.invoke([request])
    return reply.content
def process_file(input_file):
content = read_file(input_file)
docs = split_content(content)
print(f"Split into {len(docs)} docs")
chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_message_prompt])
chat_prompt = ChatPromptTemplate.from_messages(
[system_prompt, human_message_prompt]
)
os.makedirs("/tmp/proof-edits", exist_ok=True)
temp_output_file = "/tmp/proof-edits/edited_output.md"
@ -109,11 +126,20 @@ def process_file(input_file):
edited_content = f.read()
# Generate and save the diff for the whole file
diff = difflib.unified_diff(original_content.splitlines(), edited_content.splitlines(), lineterm='')
diff = difflib.unified_diff(
original_content.splitlines(), edited_content.splitlines(), lineterm=""
)
with open("/tmp/proof-edits/diff_file.diff", "w") as diff_file:
diff_file.write('\n'.join(diff))
os.replace(temp_output_file, input_file)
print(f"Edited file saved as {input_file}")
diff_file.write("\n".join(diff))
# Analyze the diff
analysis_result = analyze_diff("/tmp/proof-edits/diff_file.diff")
if analysis_result.lower().strip() == "good":
os.replace(temp_output_file, input_file)
print(f"Edited file saved as {input_file}")
else:
print(f"The diff looked suspect. Diff analysis result: {analysis_result}")
if __name__ == "__main__":

View File

@ -2,7 +2,7 @@
This documentation is currently hosted live at [Spiff-Arena's ReadTheDocs](https://spiff-arena.readthedocs.io/en/latest/).
Please set aside a couple of hours to work through this, as getting this setup correctly once is 10,000 times better than having problems every day for the rest of your life.
Please set aside a couple of hours to work through this, as getting this set up correctly once is 10,000 times better than having problems every day for the rest of your life.
## Our Methodology