用于文本格式处理或文件格式处理的一些Python脚本

任何关于代码的改进不胜感激。

脚本处理后的结果记得核对一下。

写一个Python脚本删除一个.py文件的所有注释
行内公式使用`$...$`而不是`$$...$$`
给文字加上反爬水印
PyPDF2 顺时针旋转一个PDF的所有页面并保存

写一个Python脚本删除一个.py文件的所有注释

// 也可以尝试一下 https://python-minifier.com/

import re


def remove_comments(file_path):
    """Strip comments, docstrings and blank lines from a Python file, in place.

    The file at ``file_path`` is read as UTF-8, cleaned, and overwritten.

    Processing steps:
      1. Triple-quoted strings that directly follow ``name =`` are swapped
         for placeholders so they survive step 2.
      2. Every remaining triple-quoted string is removed (treated as a
         docstring). Note this also removes triple-quoted strings used as
         plain expressions or positional arguments.
      3. ``#`` comments are removed line by line; ``#`` characters inside
         single-line quoted strings are left alone.
      4. Lines that end up empty are dropped and placeholders are restored.

    Args:
        file_path: Path of the ``.py`` file to rewrite.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Placeholder -> original "name = '''...'''" text
    protected_strings = {}

    def protect_string_assignments(match):
        key = f'PROTECTED_STRING_{len(protected_strings)}'
        protected_strings[key] = match.group(0)
        return key

    # Protect strings that are part of assignments
    protected_content = re.sub(
        r'([a-zA-Z_][a-zA-Z0-9_]*\s*=\s*)("""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\')',
        protect_string_assignments,
        content
    )

    # Remove docstring comments (triple-quoted strings not part of assignments)
    cleaned_content = re.sub(
        r'"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\'',
        '',
        protected_content
    )

    # Group 1 is a complete single-line string literal (kept verbatim);
    # the other alternative is a real comment running to the end of the line.
    string_or_comment = re.compile(
        r'''("(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*')|#.*$'''
    )

    def keep_string_drop_comment(match):
        literal = match.group(1)
        return literal if literal is not None else ''

    # Remove single-line comments and empty lines
    lines = []
    for line in cleaned_content.split('\n'):
        line = string_or_comment.sub(keep_string_drop_comment, line)
        if line.strip():
            lines.append(line)

    # Restore protected strings in a single pass. Matching the full run of
    # digits keeps PROTECTED_STRING_1 from clobbering PROTECTED_STRING_10,
    # and restored text is never rescanned for further placeholders.
    final_content = re.sub(
        r'PROTECTED_STRING_\d+',
        lambda m: protected_strings.get(m.group(0), m.group(0)),
        '\n'.join(lines)
    )

    # Write back to file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(final_content)

# Example usage. Guarded so that importing this file does not rewrite
# 'your_script.py' as a side effect; it only runs when executed as a script.
if __name__ == "__main__":
    remove_comments('your_script.py')

行内公式使用`$...$`而不是`$$...$$`

如果使用 cloud-document-converter 下载的 Markdown 文件中行内公式显示不正常，你可能需要这个脚本。

import re
from datetime import datetime

def replace_formula_delimiters_in_md(input_file):
    """Rewrite inline ``$$...$$`` formulas as ``$...$`` in a Markdown file.

    A line whose stripped text is exactly one ``$$...$$`` pair is treated as
    a display formula and copied unchanged. On every other line each
    ``$$...$$`` pair is replaced by ``$...$``. The result is written to a new
    file named ``<input stem>_<YYYYmmdd_HHMMSS>.md``; the input file itself
    is not modified. Progress is reported with ``print``.

    Args:
        input_file: Path of the UTF-8 Markdown file to read.

    Returns:
        The path of the file that was written, or ``None`` when an error was
        reported instead (errors are printed, not raised).
    """
    import os  # local import so the block does not depend on file-level imports

    # Display line: exactly one $$...$$ pair. The content may not contain
    # "$$", otherwise "$$a$$ and $$b$$" would be mistaken for one formula.
    display_line = re.compile(r'^\${2}((?:(?!\${2}).)+)\${2}$')
    inline_formula = re.compile(r'\${2}(.+?)\${2}')  # $$content$$ anywhere

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # splitext only strips a real extension, so dots in directory names
        # (e.g. "./notes") are left alone.
        output_file = f"{os.path.splitext(input_file)[0]}_{timestamp}.md"

        with open(input_file, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        new_lines = []
        replacement_count = 0
        print("Changes to be made:")
        for line in lines:
            stripped_line = line.strip()
            if display_line.match(stripped_line):
                print(f"Keeping unchanged: {stripped_line}")
                new_lines.append(line)
                continue

            new_line, hits = inline_formula.subn(r'$\1$', line)
            if hits:
                replacement_count += hits
                print(f"Changing: {stripped_line} → {new_line.strip()}")
            new_lines.append(new_line)

        with open(output_file, 'w', encoding='utf-8') as file:
            file.writelines(new_lines)

        print(f"\nCreated new file: {output_file}")
        print(f"Converted {replacement_count} formulas from $$ to $")
        return output_file

    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found")
    except Exception as e:
        # Best-effort script behaviour: report the problem instead of crashing.
        print(f"An error occurred: {str(e)}")
    return None

if __name__ == "__main__":
    # Replace the placeholder path with the Markdown file to convert.
    replace_formula_delimiters_in_md("your_file.md")

给文字加上反爬水印

import random
import re

# Watermark sentence variants (Chinese anti-scraping notices). Each one wraps
# an HTML link and contains a `{url}` placeholder to be filled in with
# str.format(url=...). There are ten variants.
templates = [
    "【防盗链提醒：爬虫是吧？原贴在：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【防爬提示：你是爬虫吗？原文地址：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【反爬警告：爬虫别乱来！原帖链接：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【防盗链通知：爬虫注意了，原文在：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【反爬提醒：是爬虫的话，原文见：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【防爬虫提示：爬虫请止步，原文链接：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【盗链警告：爬虫你好，原文地址：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【防爬声明：爬虫请看，原文出处：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【反盗链提醒：爬虫注意，原链接：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
    "【爬虫警告：想爬吗？原文在这：<a href=\"{url}\" target=\"_blank\">{url}</a>】",
]

# Address of the original post; substituted for `{url}` in the templates above.
url = "https://www.cnblogs.com/yhm138/p/18549168"

def split_into_sentences(content):
    """Split text into sentences, keeping each sentence's end punctuation.

    Split points:
      * ``.``, ``!``, ``?`` or a newline, but only when followed by
        whitespace (so ``3.14`` and single line breaks inside a paragraph
        are not split);
      * one or more of ``。！？`` plus any closing quotes/brackets right
        after them. No whitespace is required here, because Chinese text
        normally has none between sentences.

    Whitespace following a split point is discarded.

    Args:
        content: The text to split.

    Returns:
        A list of stripped, non-empty sentence strings in document order.
    """
    pieces = re.split(r'([.!?\n](?=\s)|[。！？]+[”’」』）]*)\s*', content)
    # With one capture group, re.split always returns an odd-length list:
    # text, delimiter, text, delimiter, ..., text.
    sentences = [text + mark for text, mark in zip(pieces[0::2], pieces[1::2])]
    sentences.append(pieces[-1])  # trailing text after the last delimiter
    return [s.strip() for s in sentences if s.strip()]

# Read the input Markdown file; fall back to a built-in sample if it is missing
input_file = "original_mdfile.md"
try:
    with open(input_file, "r", encoding="utf-8") as f:
        original_content = f.read()
except FileNotFoundError:
    print(f"Error: File '{input_file}' not found! Creating a sample input.")
    original_content = "This is sentence one. Here's sentence two! # Header here\nAnother sentence? Yes, indeed."

# Split content into sentences
sentences = split_into_sentences(original_content)

# Insertion point i means "directly after sentences[i]". A point is skipped
# when the following sentence starts with '#', so no watermark lands right
# before a Markdown heading.
valid_insertion_points = [
    i for i in range(len(sentences) - 1)
    if not sentences[i + 1].strip().startswith('#')
]

# Computed up front so the summary line at the end can always read it, even
# when there is nothing to insert. Capped at 10 and at the number of templates.
num_insertions = min(10, len(templates), len(valid_insertion_points))

if not valid_insertion_points:
    print("No valid insertion points found (all subsequent sentences start with '#'). Output will be unchanged.")
    content = original_content
else:
    # Random subset of insertion points; a set gives O(1) membership checks
    insertion_indices = set(random.sample(valid_insertion_points, num_insertions))

    # Distinct templates in random order, without mutating the template list
    selected_templates = iter(random.sample(templates, num_insertions))

    # Rebuild the document by walking the original text with a cursor. The
    # text between sentences (spaces, blank lines) is copied verbatim, so the
    # output equals the input plus the inserted watermarks.
    parts = []
    cursor = 0
    for i, sentence in enumerate(sentences):
        # Search from the cursor so a repeated sentence resolves to the
        # current occurrence rather than the first one in the document.
        start = original_content.index(sentence, cursor)
        parts.append(original_content[cursor:start])
        parts.append(sentence)
        cursor = start + len(sentence)
        if i in insertion_indices:
            watermark = " " + next(selected_templates).format(url=url)
            # Add a trailing space only if the original text does not
            # already provide whitespace after this sentence.
            if not original_content[cursor:cursor + 1].isspace():
                watermark += " "
            parts.append(watermark)
            print("insert.")
    parts.append(original_content[cursor:])  # whatever follows the last sentence
    content = "".join(parts)

# Write to output markdown file
output_file = "original_mdfile_with_reminders.md"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(content)

print(f"Markdown file '{output_file}' has been generated with up to {num_insertions} insertions!")

PyPDF2 顺时针旋转一个PDF的所有页面并保存

import PyPDF2

# Rotate every page of input.pdf by 90 degrees clockwise and save the result
# as output.pdf. The output is written while the input file is still open.
with open('input.pdf', 'rb') as source:
    pdf_reader = PyPDF2.PdfReader(source)
    pdf_writer = PyPDF2.PdfWriter()

    # Add each page to the writer after rotating it 90 degrees clockwise
    for page in pdf_reader.pages:
        pdf_writer.add_page(page.rotate(90))

    # Save the rotated PDF
    with open('output.pdf', 'wb') as output_file:
        pdf_writer.write(output_file)

posted @ 2024-11-16 11:16 yhm138 阅读(58) 评论(0) 收藏举报

刷新页面返回顶部

yhm138

用于文本格式处理或文件格式处理的一些Python脚本

写一个Python脚本删除一个.py文件的所有注释

行内公式使用$$而不是$$$$

给文字加上反爬水印

PyPDF2 顺时针旋转一个PDF的所有页面并保存

公告

行内公式使用`$$`而不是`$$$$`