remove proofreading scripts, see no-term-limits

This commit is contained in:
burnettk 2024-06-28 19:45:36 -04:00
parent 319568324b
commit 79f8c9164c
No known key found for this signature in database
4 changed files with 0 additions and 228 deletions

View File

@ -1,16 +0,0 @@
#!/usr/bin/env bash

# Proofread a single markdown file: show its git diff context, then run the
# LLM-based proofreader on it.
#
# usage: ./bin/this-script path/to/file.md
#   falls back to the $file environment variable if no argument is given.

function error_handler() {
  >&2 echo "Exited with BAD EXIT CODE '${2}' in ${0} script at line: ${1}."
  exit "$2"
}
trap 'error_handler ${LINENO} $?' ERR
set -o errtrace -o errexit -o nounset -o pipefail

# example input
# file="Building_Diagrams/data_stores.md"

# fix: under nounset, the original `${1:-$file}` aborts with an unhelpful
# "unbound variable" error when neither $1 nor $file is set. Use a guarded
# fallback and fail with a proper usage message instead.
file_to_use="${1:-${file:-}}"
if [[ -z "$file_to_use" ]]; then
  >&2 echo "usage: ${0} path/to/file.md  (or set \$file in the environment)"
  exit 1
fi

gitc "$file_to_use"
python bin/gpt-proofread.py "$file_to_use"

View File

@ -1,33 +0,0 @@
#!/usr/bin/env bash

# Run the GPT proofreader over every markdown file in the repository,
# normalizing each file to ventilated prose (one sentence per line)
# before and after the LLM edit.

function error_handler() {
  >&2 echo "Exited with BAD EXIT CODE '${2}' in ${0} script at line: ${1}."
  exit "$2"
}
trap 'error_handler ${LINENO} $?' ERR
set -o errtrace -o errexit -o nounset -o pipefail

# Proofread one file: ventilate, edit, re-ventilate.
function update_file() {
  local file="$1"

  # The FAQ's question-and-answer format confuses the model, so skip it.
  if [[ "$file" == "./Support/FAQ.md" ]]; then
    echo "skipping $file since it is in a question and answer format that LLMs cannot handle. They assume you are doing few-shot learning and do not return the full doc."
    return
  fi

  markdown_to_ventilated_prose.py "$file" "$file"
  ./bin/edit "$file"
  markdown_to_ventilated_prose.py "$file" "$file"
}

# NUL-delimited find output handles filenames containing spaces or newlines.
while IFS= read -r -d '' file; do
  update_file "$file"
done < <(find . -type f -name "*.md" -print0)

# update_file "Support/Welcome_Messages.md"

# these are long, problematic files, good for testing.
# not sure why documentation.md likes to get lots of extra newlines added.
# echo 'fyi, running test files, not all files'
# for file in Getting_Started/quick_start.md Support/FAQ.md documentation/documentation.md; do
#   update_file "$file"
# done

View File

@ -1,176 +0,0 @@
# originally from https://mindfulmodeler.substack.com/p/proofreading-an-entire-book-with
# and then modified for our use case.
import sys
import os
import difflib
import os.path
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
# Prompt scaffolding: the document chunk is passed through verbatim as the
# human message; all editing instructions live in the system prompt below.
human_template = """
{text}
"""
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
# Earlier, more aggressive system prompt kept for reference:
# system_text = """You are an expert technical editor specializing in business process management documentation written for enterprise software users. You are especially good at cutting clutter.
#
# - Improve grammar and language
# - fix errors
# - cut clutter
# - keep tone and voice
# - don't change markdown syntax, e.g. keep [@reference]
# - never cut jokes
# - output 1 line per sentence (same as input)
# """
# style ideas from 24 aug 2023:
# - short and focused
# - clear over fun
# - brief over verbose
# - Do not leave any trailing spaces (handled by another script, though)
# - Never remove entire sentences (didn't seem necessary, since we said keep everything else exactly the same)
# Current, conservative prompt: the model is told to fix only clear errors
# and preserve the one-sentence-per-line layout that the diff check relies on.
system_text = """You are proofreading a markdown document and you will receive text that is almost exactly correct, but may contain errors. You should:
- Fix spelling
- Not edit URLs
- Never touch a markdown link; these might look like: [Image label](images/Manual_instructions_panel.png)
- Improve grammar that is obviously wrong
- Fix awkward language if it is really bad
- Keep everything else exactly the same, including tone and voice
- not change the case of words unless they are obviously wrong
- Avoid changing markdown syntax, e.g. keep [@reference]
- Output one line per sentence (same as input)
- Avoid putting multiple sentences on the same line
- Make sure you do not remove any headers at the beginning of the text (markdown headers begin with one or more # characters)
The markdown document follows. The output document's first line should probably match that of the input document, even if it is a markdown header.
"""
system_prompt = SystemMessage(content=system_text)
# Where temp outputs, diffs, and the results log are written.
EDIT_DIR = "/tmp/edits"
# API key comes from the environment, with a local key file as fallback.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None:
    keyfile = "oai.key"
    with open(keyfile, "r") as f:
        openai_api_key = f.read().strip()
# model = "gpt-4"
model = "gpt-4o"
# model = "gpt-3.5-turbo"
# If you get timeouts, you might have to increase timeout parameter
llm = ChatOpenAI(openai_api_key=openai_api_key, model=model, request_timeout=240)
def read_file(file_path):
    """Return the entire contents of *file_path* as a string."""
    with open(file_path) as handle:
        return handle.read()
def split_content(content, chunk_size=13000):
    """Break *content* into chunks of at most *chunk_size* characters.

    Overlap is disabled so the re-joined chunks reconstruct the document
    without duplicated text.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    return text_splitter.split_text(content)
def process_chunk(doc, chat_prompt, retries=3, chunk_index=0):
    """Ask the LLM to proofread one chunk, retrying on suspicious output.

    A response whose length differs from the input by more than 5% is
    treated as a failed edit (the model likely dropped or invented
    content) and the request is retried.

    Args:
        doc: the text of one chunk of the document.
        chat_prompt: a ChatPromptTemplate with a ``{text}`` slot.
        retries: maximum number of LLM attempts.
        chunk_index: position of this chunk within the document, for logging.

    Raises:
        ValueError: if no attempt yields an acceptably sized result.
    """
    for attempt in range(retries):
        result = llm.invoke(chat_prompt.format_prompt(text=doc).to_messages())
        edited_result_content = result.content
        if 0.95 * len(doc) <= len(edited_result_content) <= 1.05 * len(doc):
            return edited_result_content
        # fix: include the (previously unused) chunk_index so retries can be
        # traced to a specific chunk, and do not log a "Retry" that will
        # never happen right before the final raise.
        if attempt < retries - 1:
            print(f"Retry {attempt + 1} for chunk {chunk_index} due to size mismatch.")
    raise ValueError(f"Failed to process chunk {chunk_index} after {retries} retries.")
def get_edited_content(docs, chat_prompt):
    """Proofread every chunk and reassemble the full document text.

    Each edited chunk is terminated with a newline, matching how the
    splitter consumed chunk boundaries.
    """
    edited_chunks = [
        process_chunk(doc, chat_prompt, chunk_index=index)
        for index, doc in enumerate(docs)
    ]
    return "".join(chunk + "\n" for chunk in edited_chunks)
def analyze_diff(diff_file_path):
    """Have the LLM vet a unified diff of the proofreading pass.

    Returns the model's verdict: the single word "good" when the diff looks
    like a legitimate copy edit, otherwise free-form feedback describing
    what is wrong.
    """
    diff_content = read_file(diff_file_path)
    analysis_prompt = f"""
You are an expert technical editor.
Please analyze the following diff and ensure it looks like a successful copy edit of a markdown file.
Editing URLs is not allowed; never touch a link like [Image label](images/Manual_instructions_panel.png)
It is not a successful edit if line one has been removed (editing is fine; removing is not).
It is not a successful edit if three or more lines in a row have been removed without replacement.
Edits or reformats are potentially good, but simply removing or adding a bunch of content is bad.
Provide feedback if there are any issues.
If it looks good, just reply with the single word: good
Diff:
{diff_content}
"""
    response = llm.invoke([HumanMessage(content=analysis_prompt)])
    return response.content
def process_file(input_file):
    """Proofread *input_file* in place with the LLM.

    Splits the file into chunks, runs each through the proofreading prompt,
    writes a unified diff of the proposed edit, and asks the LLM to vet that
    diff before overwriting the original file.

    Returns one of: "no_edits", "edited", "suspect_diff".
    """
    content = read_file(input_file)
    docs = split_content(content)
    print(f"Split into {len(docs)} docs")
    chat_prompt = ChatPromptTemplate.from_messages(
        [system_prompt, human_message_prompt]
    )
    os.makedirs(EDIT_DIR, exist_ok=True)
    edited_content = get_edited_content(docs, chat_prompt)
    # fix: dropped the dead `overall_result = None` local (assigned, never
    # used) and the redundant `original_content` alias of `content`.
    if edited_content == content:
        print(f"{input_file}: No edits made.")
        return "no_edits"
    temp_output_file = f"{EDIT_DIR}/edited_output.md"
    with open(temp_output_file, "w") as f:
        f.write(edited_content)
    # Diff filename is derived from the input's basename so diffs for
    # different files do not overwrite each other within one run.
    input_basename = os.path.basename(input_file)
    diff_file_path = f"{EDIT_DIR}/{input_basename}.diff"
    diff = difflib.unified_diff(
        content.splitlines(), edited_content.splitlines(), lineterm=""
    )
    with open(diff_file_path, "w") as diff_file:
        diff_file.write("\n".join(diff))
    # Gate the in-place overwrite on a second LLM pass that vets the diff.
    analysis_result = analyze_diff(diff_file_path)
    if analysis_result.lower().strip() == "good":
        os.replace(temp_output_file, input_file)
        print(f"{input_file}: edited!")
        return "edited"
    print(
        f"{input_file}: The diff looked suspect. Diff analysis result: {analysis_result}"
    )
    return "suspect_diff"
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # fix: report usage errors on stderr and exit nonzero instead of
        # printing to stdout and exiting 0, so shell callers (which run this
        # under errexit) can detect the failure.
        print("Usage: python script.py input_file", file=sys.stderr)
        sys.exit(1)
    input_file = sys.argv[1]
    overall_result = process_file(input_file)
    # Append so results accumulate across a batch run over many files.
    with open(f"{EDIT_DIR}/proofread_results.txt", "a") as f:
        f.write(f"{input_file}: {overall_result}\n")

View File

@ -1,3 +0,0 @@
langchain
langchain-openai
openai