# originally from https://mindfulmodeler.substack.com/p/proofreading-an-entire-book-with
# and then modified for our use case.
import sys
import os
import difflib
import os.path

from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage

human_template = """
{text}
"""
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# system_text = """You are an expert technical editor specializing in business process management documentation written for enterprise software users. You are especially good at cutting clutter.
#
# - Improve grammar and language
# - fix errors
# - cut clutter
# - keep tone and voice
# - don't change markdown syntax, e.g. keep [@reference]
# - never cut jokes
# - output 1 line per sentence (same as input)
# """

# style ideas from 24 aug 2023:
# - short and focused
# - clear over fun
# - brief over verbose
# - Do not leave any trailing spaces (handled by another script, though)
# - Never remove entire sentences (didn't seem necessary, since we said keep everything else exactly the same)

system_text = """You are proofreading a markdown document and you will receive text that is almost exactly correct, but may contain errors.

You should:

- Fix spelling
- Not edit URLs
- Never touch a markdown link; these might look like: [Image label](images/Manual_instructions_panel.png)
- Improve grammar that is obviously wrong
- Fix awkward language if it is really bad
- Keep everything else exactly the same, including tone and voice
- not change the case of words unless they are obviously wrong
- Avoid changing markdown syntax, e.g. keep [@reference]
- Output one line per sentence (same as input)
- Avoid putting multiple sentences on the same line
- Make sure you do not remove any headers at the beginning of the text (markdown headers begin with one or more # characters)

The markdown document follows. The output document's first line should probably match that of the input document, even if it is a markdown header.
""" system_prompt = SystemMessage(content=system_text) EDIT_DIR = "/tmp/edits" openai_api_key = os.environ.get("OPENAI_API_KEY") if openai_api_key is None: keyfile = "oai.key" with open(keyfile, "r") as f: openai_api_key = f.read().strip() # model = "gpt-4" model = "gpt-4o" # model = "gpt-3.5-turbo" # If you get timeouts, you might have to increase timeout parameter llm = ChatOpenAI(openai_api_key=openai_api_key, model=model, request_timeout=240) def read_file(file_path): with open(file_path, "r") as f: return f.read() def split_content(content, chunk_size=13000): splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0) return splitter.split_text(content) def process_chunk(doc, chat_prompt, retries=3, chunk_index=0): for attempt in range(retries): result = llm.invoke(chat_prompt.format_prompt(text=doc).to_messages()) edited_result_content = result.content if 0.95 * len(doc) <= len(edited_result_content) <= 1.05 * len(doc): return edited_result_content print(f"Retry {attempt + 1} for chunk due to size mismatch.") raise ValueError("Failed to process chunk after retries.") def get_edited_content(docs, chat_prompt): edited_content = "" for i, doc in enumerate(docs): edited_result_content = process_chunk(doc, chat_prompt, chunk_index=i) edited_content += edited_result_content + "\n" return edited_content def analyze_diff(diff_file_path): diff_content = read_file(diff_file_path) analysis_prompt = f""" You are an expert technical editor. Please analyze the following diff and ensure it looks like a successful copy edit of a markdown file. Editing URLs is not allowed; never touch a link like [Image label](images/Manual_instructions_panel.png) It is not a successful edit if line one has been removed (editing is fine; removing is not). It is not a successful edit if three or more lines in a row have been removed without replacement. Edits or reformats are potentially good, but simply removing or adding a bunch of content is bad. Provide feedback if there are any issues. If it looks good, just reply with the single word: good Diff: {diff_content} """ result = llm.invoke([HumanMessage(content=analysis_prompt)]) return result.content def process_file(input_file): content = read_file(input_file) docs = split_content(content) print(f"Split into {len(docs)} docs") chat_prompt = ChatPromptTemplate.from_messages( [system_prompt, human_message_prompt] ) os.makedirs(EDIT_DIR, exist_ok=True) # Save the original content for diff generation original_content = content edited_content = get_edited_content(docs, chat_prompt) temp_output_file = f"{EDIT_DIR}/edited_output.md" overall_result = None if edited_content == original_content: print(f"{input_file}: No edits made.") return "no_edits" with open(temp_output_file, "w") as f: f.write(edited_content) # Generate and save the diff for the whole file based on the basename of the input file input_basename = os.path.basename(input_file) diff_file_path = f"{EDIT_DIR}/{input_basename}.diff" diff = difflib.unified_diff( original_content.splitlines(), edited_content.splitlines(), lineterm="" ) with open(diff_file_path, "w") as diff_file: diff_file.write("\n".join(diff)) # Analyze the diff analysis_result = analyze_diff(diff_file_path) if analysis_result.lower().strip() == "good": os.replace(temp_output_file, input_file) print(f"{input_file}: edited!") return "edited" else: print( f"{input_file}: The diff looked suspect. 
            f"Diff analysis result: {analysis_result}"
        )
        return "suspect_diff"


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python script.py input_file")
    else:
        input_file = sys.argv[1]
        overall_result = process_file(input_file)
        with open(f"{EDIT_DIR}/proofread_results.txt", "a") as f:
            f.write(f"{input_file}: {overall_result}\n")
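
# A minimal sketch of batch usage over a whole book (illustrative only; the
# "book/" directory name is an assumption, not part of this script):
#
#   for f in book/*.md; do python script.py "$f"; done
#
# Each run appends its outcome ("edited", "no_edits", or "suspect_diff") to
# /tmp/edits/proofread_results.txt, and the generated diffs stay in /tmp/edits
# for manual review.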