Poprawione wyciąganie call_id

tills

Jan 8th, 2025

142

Never

Add comment

Not a member of Pastebin yet? Sign Up, it unlocks many cool features!

Python 3.44 KB | None | 0 0

raw download clone embed print report

def extract_call_segments(full_id: str) -> str:
    """Return the first three underscore-separated segments of a call ID.

    Example:
        >>> extract_call_segments(
        ...     "0240828_31657_2958059406I0L412_20240828_084737_123427680"
        ... )
        '0240828_31657_2958059406I0L412'

    An ID with fewer than three segments is returned unchanged.

    Args:
        full_id: The complete call ID string.

    Returns:
        The first three segments joined by underscores.
    """
    # maxsplit=3 stops scanning once the three leading segments are isolated;
    # everything after them lands in a fourth element that the slice discards.
    return "_".join(full_id.split("_", 3)[:3])
def process_transcription(json_content):
    """Convert a JSON transcription into text labelled with speaker and time.

    Each kept phrase becomes one line of the form
    ``"<speaker> <timestamp> <text>"``; lines are ordered by the phrase's
    ``offsetInTicks``. Channel 1 is labelled "Agent", every other channel
    "Klient". The call ID is additionally extracted from the ``source`` URL.

    Args:
        json_content: The transcription, either as raw JSON (``str``,
            ``bytes`` or ``bytearray``) or as an already-parsed mapping.

    Returns:
        A ``(formatted_text, call_id)`` tuple. ``call_id`` is an empty string
        when ``source`` is missing, null, or does not contain the
        ``speechstudiofilename=`` marker.
    """
    # Parse only raw JSON; an already-decoded mapping is used as-is.
    if isinstance(json_content, (str, bytes, bytearray)):
        data = json.loads(json_content)
    else:
        data = json_content

    # `or ""` also covers an explicit JSON null, which .get() would return
    # as None and which would break the substring test below.
    source_url = data.get("source") or ""
    call_id = ""
    marker = "speechstudiofilename="
    if marker in source_url:
        # Everything after the marker is the file name; drop the ".wav"
        # extension and keep only the first three segments of the ID.
        full_id = source_url.split(marker)[-1].replace(".wav", "")
        call_id = extract_call_segments(full_id)

    # Caption that is filtered out of the transcript; presumably a spurious
    # line emitted by the speech model rather than real speech.
    ignored_caption = "Napisy stworzone przez społeczność Amara.org"

    phrases = []
    for phrase in data.get("recognizedPhrases") or []:
        candidates = phrase.get("nBest")
        if not candidates or not candidates[0].get("display"):
            continue
        text = candidates[0]["display"].strip()
        if not text or text == ignored_caption:
            continue

        # Speaker and timestamp are only computed for phrases that are kept.
        speaker = "Agent" if phrase.get("channel", 0) == 1 else "Klient"
        offset_ticks = float(phrase.get("offsetInTicks") or 0)
        timestamp = convert_ticks_to_timestamp(offset_ticks)
        phrases.append((offset_ticks, f"{speaker} {timestamp} {text}"))

    # Order the lines chronologically; the sort is stable, so phrases with
    # equal offsets keep their input order.
    phrases.sort(key=lambda item: item[0])
    formatted_text = "\n".join(line for _, line in phrases)
    return formatted_text, call_id
def create_whisper_dataframe(transcriptions_dir):
    """Build a DataFrame of transcriptions from every JSON file in a directory.

    Files matching ``*.json`` directly inside ``transcriptions_dir`` are
    processed in sorted order. A file that cannot be read or converted is
    reported on stdout and skipped, so one bad file does not abort the run.

    Args:
        transcriptions_dir: Path of the directory holding the JSON files.

    Returns:
        A DataFrame with the columns ``nazwa_pliku``,
        ``whisper_v2_large_transkrypcja`` and ``call_id``, one row per
        successfully processed file. The columns are present even when no
        file was processed.
    """
    columns = ["nazwa_pliku", "whisper_v2_large_transkrypcja", "call_id"]
    rows = []
    json_files = sorted(Path(transcriptions_dir).glob("*.json"))
    print(f"Znaleziono {len(json_files)} plików JSON do przetworzenia.")

    for json_path in json_files:
        try:
            json_content = json_path.read_text(encoding="utf-8")
            # Yields both the formatted transcription and the call ID.
            transcription, call_id = process_transcription(json_content)
        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(
                f"Błąd podczas przetwarzania pliku {json_path.name}: {e}"
            )
            continue

        rows.append(
            {
                "nazwa_pliku": json_path.name,
                "whisper_v2_large_transkrypcja": transcription,
                "call_id": call_id,
            }
        )
        print(f"Przetworzono plik: {json_path.name}")

    # Passing the columns explicitly keeps the schema stable when `rows` is
    # empty; otherwise the frame would have no columns at all.
    df_whisper_v2_large = pd.DataFrame(rows, columns=columns)
    print(
        f"\nUtworzono DataFrame z {len(df_whisper_v2_large)} transkrypcjami."
    )
    return df_whisper_v2_large

Add Comment

Please, Sign In to add comment