Compare commits

...

3 Commits

Author SHA1 Message Date
burnettk c088d8884e
update script 2024-06-28 08:36:39 -04:00
burnettk 5ed9263f4e
update script 2024-06-28 08:32:13 -04:00
burnettk d484633e17
update script 2024-06-28 08:26:17 -04:00
1 changed file with 18 additions and 7 deletions

View File

@@ -3,6 +3,7 @@
import sys import sys
import os import os
import difflib import difflib
import os.path
from langchain.prompts import PromptTemplate from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter from langchain.text_splitter import CharacterTextSplitter
@@ -36,9 +37,11 @@ human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
# - Do not leave any trailing spaces (handled by another script, though) # - Do not leave any trailing spaces (handled by another script, though)
# - Never remove entire sentences (didn't seem necessary, since we said keep everything else exactly the same) # - Never remove entire sentences (didn't seem necessary, since we said keep everything else exactly the same)
system_text = """You are proofreading and you will receive text that is almost exactly correct, but may contain errors. You should: system_text = """You are proofreading a markdown document and you will receive text that is almost exactly correct, but may contain errors. You should:
- Fix spelling - Fix spelling
- Not edit URLs
- Never touch a markdown link; these might look like: [Image label](images/Manual_instructions_panel.png)
- Improve grammar that is obviously wrong - Improve grammar that is obviously wrong
- Fix awkward language if it is really bad - Fix awkward language if it is really bad
- Keep everything else exactly the same, including tone and voice - Keep everything else exactly the same, including tone and voice
@@ -47,10 +50,14 @@ system_text = """You are proofreading and you will receive text that is almost e
- Output one line per sentence (same as input) - Output one line per sentence (same as input)
- Avoid putting multiple sentences on the same line - Avoid putting multiple sentences on the same line
- Make sure you do not remove any headers at the beginning of the text (markdown headers begin with one or more # characters) - Make sure you do not remove any headers at the beginning of the text (markdown headers begin with one or more # characters)
The markdown document follows. The output document's first line should probably match that of the input document, even if it is a markdown header.
""" """
system_prompt = SystemMessage(content=system_text) system_prompt = SystemMessage(content=system_text)
EDIT_DIR = "/tmp/edits"
openai_api_key = os.environ.get("OPENAI_API_KEY") openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None: if openai_api_key is None:
keyfile = "oai.key" keyfile = "oai.key"
@@ -98,6 +105,8 @@ def analyze_diff(diff_file_path):
analysis_prompt = f""" analysis_prompt = f"""
You are an expert technical editor. You are an expert technical editor.
Please analyze the following diff and ensure it looks like a successful copy edit of a markdown file. Please analyze the following diff and ensure it looks like a successful copy edit of a markdown file.
Editing URLs is not allowed; never touch a link like [Image label](images/Manual_instructions_panel.png)
It is not a successful edit if line one has been removed (editing is fine; removing is not).
It is not a successful edit if three or more lines in a row have been removed without replacement. It is not a successful edit if three or more lines in a row have been removed without replacement.
Edits or reformats are potentially good, but simply removing or adding a bunch of content is bad. Edits or reformats are potentially good, but simply removing or adding a bunch of content is bad.
Provide feedback if there are any issues. Provide feedback if there are any issues.
@@ -118,13 +127,13 @@ def process_file(input_file):
chat_prompt = ChatPromptTemplate.from_messages( chat_prompt = ChatPromptTemplate.from_messages(
[system_prompt, human_message_prompt] [system_prompt, human_message_prompt]
) )
os.makedirs("/tmp/proof-edits", exist_ok=True) os.makedirs(EDIT_DIR, exist_ok=True)
# Save the original content for diff generation # Save the original content for diff generation
original_content = content original_content = content
edited_content = get_edited_content(docs, chat_prompt) edited_content = get_edited_content(docs, chat_prompt)
temp_output_file = "/tmp/proof-edits/edited_output.md" temp_output_file = f"{EDIT_DIR}/edited_output.md"
overall_result = None overall_result = None
if edited_content == original_content: if edited_content == original_content:
@@ -134,15 +143,17 @@ def process_file(input_file):
with open(temp_output_file, "w") as f: with open(temp_output_file, "w") as f:
f.write(edited_content) f.write(edited_content)
# Generate and save the diff for the whole file # Generate and save the diff for the whole file based on the basename of the input file
input_basename = os.path.basename(input_file)
diff_file_path = f"{EDIT_DIR}/{input_basename}.diff"
diff = difflib.unified_diff( diff = difflib.unified_diff(
original_content.splitlines(), edited_content.splitlines(), lineterm="" original_content.splitlines(), edited_content.splitlines(), lineterm=""
) )
with open("/tmp/proof-edits/diff_file.diff", "w") as diff_file: with open(diff_file_path, "w") as diff_file:
diff_file.write("\n".join(diff)) diff_file.write("\n".join(diff))
# Analyze the diff # Analyze the diff
analysis_result = analyze_diff("/tmp/proof-edits/diff_file.diff") analysis_result = analyze_diff(diff_file_path)
if analysis_result.lower().strip() == "good": if analysis_result.lower().strip() == "good":
os.replace(temp_output_file, input_file) os.replace(temp_output_file, input_file)
@@ -161,5 +172,5 @@ if __name__ == "__main__":
else: else:
input_file = sys.argv[1] input_file = sys.argv[1]
overall_result = process_file(input_file) overall_result = process_file(input_file)
with open("/tmp/proof-edits/proofread_results.txt", "a") as f: with open(f"{EDIT_DIR}/proofread_results.txt", "a") as f:
f.write(f"{input_file}: {overall_result}\n") f.write(f"{input_file}: {overall_result}\n")