remove proofing scripts, see no-term-limits
This commit is contained in:
parent
319568324b
commit
79f8c9164c
|
@ -1,16 +0,0 @@
|
|||
#!/usr/bin/env bash

function error_handler() {
  >&2 echo "Exited with BAD EXIT CODE '${2}' in ${0} script at line: ${1}."
  exit "$2"
}
trap 'error_handler ${LINENO} $?' ERR
set -o errtrace -o errexit -o nounset -o pipefail

# Commit the current state of one markdown file, then run the GPT
# proofreading script over it.
#
# example input
# file="Building_Diagrams/data_stores.md"

# With nounset enabled, referencing an unset $1/$file dies with a cryptic
# "unbound variable" message; fail with a clear usage line instead.
if [[ -z "${1:-}" && -z "${file:-}" ]]; then
  >&2 echo "Usage: ${0} <markdown-file>"
  exit 1
fi

file_to_use="${1:-$file}"

gitc "$file_to_use"
python bin/gpt-proofread.py "$file_to_use"
|
|
@ -1,33 +0,0 @@
|
|||
#!/usr/bin/env bash

error_handler() {
  >&2 echo "Exited with BAD EXIT CODE '${2}' in ${0} script at line: ${1}."
  exit "$2"
}
trap 'error_handler ${LINENO} $?' ERR
set -o errtrace -o errexit -o nounset -o pipefail

# Proofread one markdown file: normalize to ventilated prose, run the LLM
# editor, then normalize again.
update_file() {
  local file="$1"
  if [[ "$file" != "./Support/FAQ.md" ]]; then
    markdown_to_ventilated_prose.py "$file" "$file"
    ./bin/edit "$file"
    markdown_to_ventilated_prose.py "$file" "$file"
  else
    echo "skipping $file since it is in a question and answer format that LLMs cannot handle. They assume you are doing few-shot learning and do not return the full doc."
  fi
}

# Walk every markdown file in the repo (NUL-delimited so paths with spaces
# survive) and proofread each one.
while IFS= read -r -d '' md_path; do
  update_file "$md_path"
done < <(find . -type f -name "*.md" -print0)

# Single-file invocation, kept for reference:
# update_file "Support/Welcome_Messages.md"

# Long, problematic files that are good for testing.
# Not sure why documentation.md likes to get lots of extra newlines added.
# echo 'fyi, running test files, not all files'
# for md_path in Getting_Started/quick_start.md Support/FAQ.md documentation/documentation.md; do
#   update_file "$md_path"
# done
|
|
@ -1,176 +0,0 @@
|
|||
# originally from https://mindfulmodeler.substack.com/p/proofreading-an-entire-book-with
|
||||
# and then modified for our use case.
|
||||
import sys
|
||||
import os
|
||||
import difflib
|
||||
import os.path
|
||||
from langchain.prompts import PromptTemplate
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
# The human message carries only the raw chunk text; all editing
# instructions live in the system prompt below.
human_template = """
{text}
"""
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Earlier, more aggressive prompt, kept for reference:
# system_text = """You are an expert technical editor specializing in business process management documentation written for enterprise software users. You are especially good at cutting clutter.
#
# - Improve grammar and language
# - fix errors
# - cut clutter
# - keep tone and voice
# - don't change markdown syntax, e.g. keep [@reference]
# - never cut jokes
# - output 1 line per sentence (same as input)
# """

# style ideas from 24 aug 2023:
# - short and focused
# - clear over fun
# - brief over verbose
# - Do not leave any trailing spaces (handled by another script, though)
# - Never remove entire sentences (didn't seem necessary, since we said keep everything else exactly the same)

system_text = """You are proofreading a markdown document and you will receive text that is almost exactly correct, but may contain errors. You should:

- Fix spelling
- Not edit URLs
- Never touch a markdown link; these might look like: [Image label](images/Manual_instructions_panel.png)
- Improve grammar that is obviously wrong
- Fix awkward language if it is really bad
- Keep everything else exactly the same, including tone and voice
- not change the case of words unless they are obviously wrong
- Avoid changing markdown syntax, e.g. keep [@reference]
- Output one line per sentence (same as input)
- Avoid putting multiple sentences on the same line
- Make sure you do not remove any headers at the beginning of the text (markdown headers begin with one or more # characters)

The markdown document follows. The output document's first line should probably match that of the input document, even if it is a markdown header.
"""

system_prompt = SystemMessage(content=system_text)

# Working directory for edited output, per-file diffs, and the results log.
EDIT_DIR = "/tmp/edits"

# Prefer the environment variable; fall back to a local key file so the
# script also works outside an env-configured shell.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None:
    keyfile = "oai.key"
    with open(keyfile, "r") as f:
        openai_api_key = f.read().strip()

# model = "gpt-4"
model = "gpt-4o"
# model = "gpt-3.5-turbo"

# If you get timeouts, you might have to increase timeout parameter
llm = ChatOpenAI(openai_api_key=openai_api_key, model=model, request_timeout=240)
|
||||
|
||||
|
||||
def read_file(file_path):
    """Return the entire contents of *file_path* as one string."""
    with open(file_path, "r") as handle:
        contents = handle.read()
    return contents
|
||||
|
||||
|
||||
def split_content(content, chunk_size=13000):
    """Break *content* into chunks of roughly *chunk_size* characters.

    Uses langchain's CharacterTextSplitter with no overlap, so the chunks
    can be concatenated back together after editing.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_text(content)
    return chunks
|
||||
|
||||
|
||||
def process_chunk(doc, chat_prompt, retries=3, chunk_index=0):
    """Proofread one chunk of text via the LLM, retrying on size mismatch.

    A proofread that changes the chunk's length by more than 5% usually
    means the model dropped or invented content, so such a result is
    rejected and the chunk is retried.

    Args:
        doc: the chunk text to proofread.
        chat_prompt: ChatPromptTemplate combining the system and human prompts.
        retries: how many attempts before giving up.
        chunk_index: position of this chunk in the document, used in messages.

    Returns:
        The edited chunk text.

    Raises:
        ValueError: if no attempt produced an acceptably-sized result.
    """
    for attempt in range(retries):
        result = llm.invoke(chat_prompt.format_prompt(text=doc).to_messages())
        edited_result_content = result.content
        # Accept only edits within ±5% of the input length.
        if 0.95 * len(doc) <= len(edited_result_content) <= 1.05 * len(doc):
            return edited_result_content
        # Fix: chunk_index was accepted but never used; include it (and the
        # sizes) so failures are attributable to a specific chunk.
        print(
            f"Retry {attempt + 1} for chunk {chunk_index} due to size mismatch "
            f"(input {len(doc)} chars, output {len(edited_result_content)} chars)."
        )
    raise ValueError(f"Failed to process chunk {chunk_index} after {retries} retries.")
|
||||
|
||||
|
||||
def get_edited_content(docs, chat_prompt):
    """Proofread every chunk in *docs* and concatenate the edited results.

    Each chunk's output is terminated with a newline before joining.
    """
    edited_chunks = []
    for index, chunk in enumerate(docs):
        edited_chunks.append(process_chunk(chunk, chat_prompt, chunk_index=index) + "\n")
    return "".join(edited_chunks)
|
||||
|
||||
|
||||
def analyze_diff(diff_file_path):
    """Ask the LLM to review the saved diff as a second-pass sanity check.

    Returns the model's reply; by convention the single word "good" means
    the diff looks like a safe copy edit.
    """
    diff_text = read_file(diff_file_path)
    analysis_prompt = f"""
You are an expert technical editor.
Please analyze the following diff and ensure it looks like a successful copy edit of a markdown file.
Editing URLs is not allowed; never touch a link like [Image label](images/Manual_instructions_panel.png)
It is not a successful edit if line one has been removed (editing is fine; removing is not).
It is not a successful edit if three or more lines in a row have been removed without replacement.
Edits or reformats are potentially good, but simply removing or adding a bunch of content is bad.
Provide feedback if there are any issues.
If it looks good, just reply with the single word: good

Diff:
{diff_text}
"""
    reply = llm.invoke([HumanMessage(content=analysis_prompt)])
    return reply.content
|
||||
|
||||
|
||||
def process_file(input_file):
    """Proofread *input_file* in place, guarded by an LLM diff review.

    Splits the file into chunks, has the LLM proofread each chunk, writes a
    unified diff into EDIT_DIR, and only overwrites the original file when a
    second LLM pass judges the diff to be a safe copy edit.

    Returns:
        One of "no_edits", "edited", or "suspect_diff".
    """
    content = read_file(input_file)
    docs = split_content(content)
    print(f"Split into {len(docs)} docs")

    chat_prompt = ChatPromptTemplate.from_messages(
        [system_prompt, human_message_prompt]
    )
    os.makedirs(EDIT_DIR, exist_ok=True)

    # Save the original content for diff generation
    original_content = content

    edited_content = get_edited_content(docs, chat_prompt)
    temp_output_file = f"{EDIT_DIR}/edited_output.md"

    # Fix: removed dead local `overall_result = None` — it was never read;
    # each branch below returns its own status string.
    if edited_content == original_content:
        print(f"{input_file}: No edits made.")
        return "no_edits"

    with open(temp_output_file, "w") as f:
        f.write(edited_content)

    # Generate and save the diff for the whole file based on the basename of the input file
    input_basename = os.path.basename(input_file)
    diff_file_path = f"{EDIT_DIR}/{input_basename}.diff"
    diff = difflib.unified_diff(
        original_content.splitlines(), edited_content.splitlines(), lineterm=""
    )
    with open(diff_file_path, "w") as diff_file:
        diff_file.write("\n".join(diff))

    # Have the LLM review its own edit before touching the original file.
    analysis_result = analyze_diff(diff_file_path)

    if analysis_result.lower().strip() == "good":
        # Review passed: atomically replace the original with the edit.
        os.replace(temp_output_file, input_file)
        print(f"{input_file}: edited!")
        return "edited"
    else:
        print(
            f"{input_file}: The diff looked suspect. Diff analysis result: {analysis_result}"
        )
        return "suspect_diff"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Fix: a usage error previously printed to stdout and exited 0,
        # which set -o errexit callers could not detect. Report on stderr
        # and exit nonzero.
        print("Usage: python gpt-proofread.py input_file", file=sys.stderr)
        sys.exit(1)
    input_file = sys.argv[1]
    overall_result = process_file(input_file)
    # Append this file's outcome to the running results log.
    with open(f"{EDIT_DIR}/proofread_results.txt", "a") as f:
        f.write(f"{input_file}: {overall_result}\n")
|
|
@ -1,3 +0,0 @@
|
|||
# Prompt templating, text splitting, and message types for the proofreader
langchain
# ChatOpenAI model wrapper for langchain
langchain-openai
# OpenAI API client
openai
|
Loading…
Reference in New Issue