הצעת ייעול | הוספת מילון ראשי תיבות

import requests
import json
import time
import re
from google.colab import files

# --- Settings ---
# Endpoint that call_api() POSTs its JSON payload to.
API_URL = "https://nakdan-u1-0.loadbalancer.dicta.org.il/api"
# Size limit handed to smart_chunking() as max_size when the input text is split.
CHUNK_SIZE = 1500
# File that receives the collected log lines at the end of the run.
LOG_FILE = "debug_log.txt"
# File that receives the final abbreviation -> options mapping as JSON.
JSON_FILE = "abbreviations_result.json"

# Global variables
# Every line produced by log(), in emission order.
logs = []
all_potential_misses = set() # pool of all candidate words that were not resolved

def log(message):
    """Echo *message* with an ``[HH:MM:SS]`` prefix and remember it.

    The stamped line is printed to stdout and appended to the module-level
    ``logs`` list.

    Args:
        message: Text (or any object with a string form) to record.
    """
    stamped = "[{}] {}".format(time.strftime("%H:%M:%S"), message)
    print(stamped)
    logs.append(stamped)

def smart_chunking(text, max_size):
    """Split *text* into chunks along line boundaries.

    Lines are packed greedily; each line costs ``len(line) + 1`` (the extra
    one accounts for its newline separator) and a chunk is closed before a
    line that would push its cost above *max_size*.

    A single line whose cost alone exceeds *max_size* is split on spaces
    into space-joined pieces. The last piece stays open, so following lines
    may still be packed into the same chunk. A single word longer than
    *max_size* is never cut; it is emitted as one oversized piece.

    Args:
        text: The full input text.
        max_size: Cost budget per chunk, in characters.

    Returns:
        List of chunk strings, in original order.
    """
    chunks = []
    current_chunk = []  # lines of the chunk being built; joined with "\n"
    current_length = 0

    for line in text.split('\n'):
        line_len = len(line) + 1

        # Common case: the line still fits in the open chunk.
        if current_length + line_len <= max_size:
            current_chunk.append(line)
            current_length += line_len
            continue

        # The line does not fit: close the open chunk first.
        if current_chunk:
            chunks.append("\n".join(current_chunk))
            current_chunk = []
            current_length = 0

        if line_len <= max_size:
            current_chunk.append(line)
            current_length = line_len
            continue

        # Oversized line: break it up word by word.
        piece = []
        piece_len = 0
        for word in line.split(' '):
            word_len = len(word) + 1
            # Flush only a non-empty piece, so a word that is itself longer
            # than max_size does not produce an empty chunk.
            if piece and piece_len + word_len > max_size:
                chunks.append(" ".join(piece))
                piece = []
                piece_len = 0
            piece.append(word)
            piece_len += word_len

        # Keep the remainder as ONE line entry. Storing the individual words
        # would make the final "\n".join turn their spaces into newlines.
        current_chunk = [" ".join(piece)]
        current_length = piece_len

    if current_chunk:
        chunks.append("\n".join(current_chunk))
    return chunks

def call_api(data_payload):
    """POST *data_payload* as JSON to ``API_URL`` and return the decoded reply.

    The call is best-effort: a transport error, a non-200 status or a body
    that is not valid JSON is reported through ``log`` and turned into a
    ``None`` result, so callers can skip the request and carry on.

    Args:
        data_payload: JSON-serialisable request body.

    Returns:
        The decoded JSON response, or ``None`` when the request failed.
    """
    headers = {"Content-Type": "application/json;charset=UTF-8"}

    # Only request-level failures are caught; KeyboardInterrupt and
    # programming errors are allowed to propagate.
    try:
        response = requests.post(API_URL, json=data_payload, headers=headers, timeout=45)
    except requests.RequestException as exc:
        log(f"⚠️ קריאה ל-API נכשלה: {exc}")
        return None

    if response.status_code != 200:
        log(f"⚠️ ה-API החזיר סטטוס {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # requests' JSON decode errors derive from ValueError.
        log(f"⚠️ תשובת ה-API אינה JSON תקין: {exc}")
        return None

def strip_prefixes(word):
    """Strip leading Hebrew prefix letters from an abbreviation candidate.

    A run of the letters מ ש ה ו כ ל ב ד is removed from the start of
    *word*, but only as far as at least one Hebrew letter is still left in
    front of the gershayim (``"``). This keeps the result a well-formed
    ``letters"...`` token instead of a fragment that begins with the quote.

    This is a heuristic: the stripped letters may in fact belong to the
    abbreviation itself.

    Args:
        word: Candidate token, e.g. a word containing ``"``.

    Returns:
        The stripped token, or *word* unchanged when nothing can be
        stripped (no prefix letters, no gershayim, or stripping would leave
        no letter before the gershayim).
    """
    # The lookahead makes the greedy run back off until a letter precedes the quote.
    return re.sub(r'^[משהוכלבד]+(?=[א-ת]+")', '', word)

def retry_missed_words(missed_words_list, existing_results, batch_size=500):
    """Second-chance pass for abbreviations the first pass did not resolve.

    Each missed word is reduced with ``strip_prefixes``; the distinct
    reduced forms are sent to the API as space-separated word lists, in
    batches. Every expansion that comes back is stored in
    *existing_results* under each original spelling that reduces to the
    returned word.

    Args:
        missed_words_list: Words that still have no expansion.
        existing_results: Mapping ``word -> options``; updated in place.
        batch_size: Maximum number of words per API request.

    Returns:
        *existing_results* (the same object), possibly with new entries.
    """
    if not missed_words_list:
        return existing_results

    log(f"🔄 מתחיל סבב ב' (Retry) עבור {len(missed_words_list)} מילים שלא פוענחו...")

    # Reduced form -> every original spelling that reduces to it. A list is
    # needed because several originals can share one reduced form; a plain
    # str -> str mapping would keep only the last of them.
    clean_to_originals = {}
    for word in missed_words_list:
        clean_to_originals.setdefault(strip_prefixes(word), []).append(word)

    # Dict keys are unique, so each reduced form is sent only once.
    batch_text = list(clean_to_originals)
    new_found_count = 0

    for start in range(0, len(batch_text), batch_size):
        batch = batch_text[start:start + batch_size]

        payload = {
            "task": "abbrexp",
            "data": " ".join(batch),  # sent as a plain list of words
            "useTokenization": True,
            "genre": "rabbinic"
        }

        data = call_api(payload)

        if data and 'data' in data:
            for item in data['data']:
                if 'abbreviation' in item and item['abbreviation']:
                    abbr = item['abbreviation']
                    found_clean_word = abbr.get('word')
                    options = abbr.get('options')
                    if not (found_clean_word and options):
                        continue

                    # Store the expansion under the original (prefixed)
                    # spellings, not under the reduced form the API saw.
                    for original_word in clean_to_originals.get(found_clean_word, ()):
                        if original_word not in existing_results:
                            new_found_count += 1  # count each rescued word once
                        existing_results[original_word] = options

        time.sleep(0.2)  # short pause between batches

    log(f"✅ סבב ב' הסתיים: הוצלו עוד {new_found_count} ראשי תיבות!")
    return existing_results

# --- Main program ---
# Flow: upload a text file, split it into chunks, ask the API to expand the
# abbreviations in every chunk, retry whatever looked like an abbreviation
# but was not expanded, then write and download the JSON result and the log.

print("אנא בחר קובץ טקסט...")
uploaded = files.upload()
if uploaded:
    # Only the first uploaded file is processed.
    input_filename = next(iter(uploaded))
    log(f"הקובץ {input_filename} נטען.")

    with open(input_filename, 'r', encoding='utf-8') as source_file:
        full_text = source_file.read()

    chunks = smart_chunking(full_text, CHUNK_SIZE)
    log(f"מתחיל עיבוד ב-{len(chunks)} מקטעים...")

    final_results = {}

    # Stage 1: main pass, one API request per chunk.
    for position, segment in enumerate(chunks, start=1):
        # Potential abbreviations in this chunk: Hebrew letters around a double quote.
        candidates = set(re.findall(r'\b[א-ת]+"[א-ת]+\b', segment))

        reply = call_api({
            "task": "abbrexp",
            "data": segment,
            "useTokenization": True,
            "genre": "rabbinic",
        })

        resolved = set()
        entries = reply['data'] if reply and 'data' in reply else []
        for entry in entries:
            # Skip tokens that carry no abbreviation information.
            if 'abbreviation' not in entry or not entry['abbreviation']:
                continue
            details = entry['abbreviation']
            word = details.get('word')
            options = details.get('options')
            if word and options:
                final_results[word] = options
                resolved.add(word)

        # Candidates seen in the text that the API did not expand in this chunk.
        unresolved = candidates - resolved
        all_potential_misses.update(unresolved)

        percent = int(position / len(chunks) * 100)
        print(f"\rעיבוד: {percent}% (נמצאו: {len(resolved)}, חשודים כפספוס: {len(unresolved)})", end="")
        time.sleep(0.1)

    print("\nסיימנו סבב ראשון.")

    # Stage 2: rescue attempt. Drop words that were expanded after all
    # (e.g. they appeared in another chunk and were recognised there).
    really_missed = [
        candidate for candidate in all_potential_misses
        if candidate not in final_results
    ]
    if really_missed:
        final_results = retry_missed_words(really_missed, final_results)

    # Save
    log(f"סה\"כ ראשי תיבות מפוענחים: {len(final_results)}")

    with open(JSON_FILE, 'w', encoding='utf-8') as json_out:
        json.dump(final_results, json_out, ensure_ascii=False, indent=4)

    with open(LOG_FILE, 'w', encoding='utf-8') as log_out:
        log_out.write("\n".join(logs))

    files.download(JSON_FILE)
    files.download(LOG_FILE)

והנה התוצאה
ראשי תיבות.json
מקווה שיצרתי את ה-JSON במבנה הנכון...

פלמנמוני

@מיכאלוש כתב בהצעת ייעול | הוספת מילון ראשי תיבות:

מקווה שיצרתי את ה-JSON במבנה הנכון...

תקין, וטופל

פורום אוצריא

הצעת ייעול | הוספת מילון ראשי תיבות