From 79f8c9164c73d68f1df27f116d718333fdf9dd68 Mon Sep 17 00:00:00 2001
From: burnettk
Date: Fri, 28 Jun 2024 19:45:36 -0400
Subject: [PATCH] remove proofing scripts, see no-term-limits

---
 docs/bin/edit                 |  16 ----
 docs/bin/edit_all             |  33 -------
 docs/bin/gpt-proofread.py     | 176 ----------------------------------
 docs/bin/gpt-requirements.txt |   3 -
 4 files changed, 228 deletions(-)
 delete mode 100755 docs/bin/edit
 delete mode 100755 docs/bin/edit_all
 delete mode 100644 docs/bin/gpt-proofread.py
 delete mode 100644 docs/bin/gpt-requirements.txt

diff --git a/docs/bin/edit b/docs/bin/edit
deleted file mode 100755
index cd03b72d..00000000
--- a/docs/bin/edit
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-function error_handler() {
-  >&2 echo "Exited with BAD EXIT CODE '${2}' in ${0} script at line: ${1}."
-  exit "$2"
-}
-trap 'error_handler ${LINENO} $?' ERR
-set -o errtrace -o errexit -o nounset -o pipefail
-
-# example input
-# file="Building_Diagrams/data_stores.md"
-
-file_to_use="${1:-$file}"
-
-gitc "$file_to_use"
-python bin/gpt-proofread.py "$file_to_use"
diff --git a/docs/bin/edit_all b/docs/bin/edit_all
deleted file mode 100755
index b06d05a1..00000000
--- a/docs/bin/edit_all
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/env bash
-
-function error_handler() {
-  >&2 echo "Exited with BAD EXIT CODE '${2}' in ${0} script at line: ${1}."
-  exit "$2"
-}
-trap 'error_handler ${LINENO} $?' ERR
-set -o errtrace -o errexit -o nounset -o pipefail
-
-# function to update single file
-function update_file() {
-  local file="$1"
-  if [[ "$file" == "./Support/FAQ.md" ]]; then
-    echo "skipping $file since it is in a question and answer format that LLMs cannot handle. They assume you are doing few-shot learning and do not return the full doc."
-    return
-  fi
-  markdown_to_ventilated_prose.py "$file" "$file"
-  ./bin/edit "$file"
-  markdown_to_ventilated_prose.py "$file" "$file"
-}
-
-while IFS= read -r -d '' file; do
-  update_file "$file"
-done < <(find . -type f -name "*.md" -print0)
-
-# update_file "Support/Welcome_Messages.md"
-
-# these are long, problematic files, good for testing.
-# not sure why documentation.md likes to get lots of extra newlines added.
-# echo 'fyi, running test files, not all files'
-# for file in Getting_Started/quick_start.md Support/FAQ.md documentation/documentation.md; do
-#   update_file "$file"
-# done
diff --git a/docs/bin/gpt-proofread.py b/docs/bin/gpt-proofread.py
deleted file mode 100644
index ab1bf864..00000000
--- a/docs/bin/gpt-proofread.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# originally from https://mindfulmodeler.substack.com/p/proofreading-an-entire-book-with
-# and then modified for our use case.
-import sys
-import os
-import difflib
-import os.path
-from langchain.prompts import PromptTemplate
-from langchain_openai import ChatOpenAI
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.prompts.chat import (
-    ChatPromptTemplate,
-    SystemMessagePromptTemplate,
-    HumanMessagePromptTemplate,
-)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
-
-human_template = """
-{text}
-"""
-human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
-
-# system_text = """You are an expert technical editor specializing in business process management documentation written for enterprise software users. You are especially good at cutting clutter.
-#
-# - Improve grammar and language
-# - fix errors
-# - cut clutter
-# - keep tone and voice
-# - don't change markdown syntax, e.g. keep [@reference]
-# - never cut jokes
-# - output 1 line per sentence (same as input)
-# """
-
-# style ideas from 24 aug 2023:
-# - short and focused
-# - clear over fun
-# - brief over verbose
-# - Do not leave any trailing spaces (handled by another script, though)
-# - Never remove entire sentences (didn't seem necessary, since we said keep everything else exactly the same)
-
-system_text = """You are proofreading a markdown document and you will receive text that is almost exactly correct, but may contain errors. You should:
-
-- Fix spelling
-- Not edit URLs
-- Never touch a markdown link; these might look like: [Image label](images/Manual_instructions_panel.png)
-- Improve grammar that is obviously wrong
-- Fix awkward language if it is really bad
-- Keep everything else exactly the same, including tone and voice
-- not change the case of words unless they are obviously wrong
-- Avoid changing markdown syntax, e.g. keep [@reference]
-- Output one line per sentence (same as input)
-- Avoid putting multiple sentences on the same line
-- Make sure you do not remove any headers at the beginning of the text (markdown headers begin with one or more # characters)
-
-The markdown document follows. The output document's first line should probably match that of the input document, even if it is a markdown header.
-"""
-
-system_prompt = SystemMessage(content=system_text)
-
-EDIT_DIR = "/tmp/edits"
-
-openai_api_key = os.environ.get("OPENAI_API_KEY")
-if openai_api_key is None:
-    keyfile = "oai.key"
-    with open(keyfile, "r") as f:
-        openai_api_key = f.read().strip()
-
-# model = "gpt-4"
-model = "gpt-4o"
-# model = "gpt-3.5-turbo"
-
-# If you get timeouts, you might have to increase timeout parameter
-llm = ChatOpenAI(openai_api_key=openai_api_key, model=model, request_timeout=240)
-
-
-def read_file(file_path):
-    with open(file_path, "r") as f:
-        return f.read()
-
-
-def split_content(content, chunk_size=13000):
-    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
-    return splitter.split_text(content)
-
-
-def process_chunk(doc, chat_prompt, retries=3, chunk_index=0):
-    for attempt in range(retries):
-        result = llm.invoke(chat_prompt.format_prompt(text=doc).to_messages())
-        edited_result_content = result.content
-        if 0.95 * len(doc) <= len(edited_result_content) <= 1.05 * len(doc):
-            return edited_result_content
-        print(f"Retry {attempt + 1} for chunk due to size mismatch.")
-    raise ValueError("Failed to process chunk after retries.")
-
-
-def get_edited_content(docs, chat_prompt):
-    edited_content = ""
-    for i, doc in enumerate(docs):
-        edited_result_content = process_chunk(doc, chat_prompt, chunk_index=i)
-        edited_content += edited_result_content + "\n"
-    return edited_content
-
-
-def analyze_diff(diff_file_path):
-    diff_content = read_file(diff_file_path)
-    analysis_prompt = f"""
-You are an expert technical editor.
-Please analyze the following diff and ensure it looks like a successful copy edit of a markdown file.
-Editing URLs is not allowed; never touch a link like [Image label](images/Manual_instructions_panel.png)
-It is not a successful edit if line one has been removed (editing is fine; removing is not).
-It is not a successful edit if three or more lines in a row have been removed without replacement.
-Edits or reformats are potentially good, but simply removing or adding a bunch of content is bad.
-Provide feedback if there are any issues.
-If it looks good, just reply with the single word: good
-
-Diff:
-{diff_content}
-"""
-    result = llm.invoke([HumanMessage(content=analysis_prompt)])
-    return result.content
-
-
-def process_file(input_file):
-    content = read_file(input_file)
-    docs = split_content(content)
-    print(f"Split into {len(docs)} docs")
-
-    chat_prompt = ChatPromptTemplate.from_messages(
-        [system_prompt, human_message_prompt]
-    )
-    os.makedirs(EDIT_DIR, exist_ok=True)
-
-    # Save the original content for diff generation
-    original_content = content
-
-    edited_content = get_edited_content(docs, chat_prompt)
-    temp_output_file = f"{EDIT_DIR}/edited_output.md"
-
-    overall_result = None
-    if edited_content == original_content:
-        print(f"{input_file}: No edits made.")
-        return "no_edits"
-
-    with open(temp_output_file, "w") as f:
-        f.write(edited_content)
-
-    # Generate and save the diff for the whole file based on the basename of the input file
-    input_basename = os.path.basename(input_file)
-    diff_file_path = f"{EDIT_DIR}/{input_basename}.diff"
-    diff = difflib.unified_diff(
-        original_content.splitlines(), edited_content.splitlines(), lineterm=""
-    )
-    with open(diff_file_path, "w") as diff_file:
-        diff_file.write("\n".join(diff))
-
-    # Analyze the diff
-    analysis_result = analyze_diff(diff_file_path)
-
-    if analysis_result.lower().strip() == "good":
-        os.replace(temp_output_file, input_file)
-        print(f"{input_file}: edited!")
-        return "edited"
-    else:
-        print(
-            f"{input_file}: The diff looked suspect. Diff analysis result: {analysis_result}"
-        )
-        return "suspect_diff"
-
-
-if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python script.py input_file")
-    else:
-        input_file = sys.argv[1]
-        overall_result = process_file(input_file)
-        with open(f"{EDIT_DIR}/proofread_results.txt", "a") as f:
-            f.write(f"{input_file}: {overall_result}\n")
diff --git a/docs/bin/gpt-requirements.txt b/docs/bin/gpt-requirements.txt
deleted file mode 100644
index 887b1831..00000000
--- a/docs/bin/gpt-requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-langchain
-langchain-openai
-openai