81 lines
3.1 KiB
Python
81 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
|
|
from argparse import ArgumentParser
|
|
import glob
|
|
import os
|
|
|
|
import pandas as pd
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of the CSV merge script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is what callers that pass nothing get.

    Returns:
        argparse.Namespace with the attributes ``lang`` (positional),
        ``csv_path`` (``None`` unless ``--csv-path`` is given),
        ``data_path`` (``"data/"`` unless ``--data-path`` is given) and
        ``fix_2017_data`` (``True`` only when ``--fix-2017-data`` is passed).
    """
    parser = ArgumentParser()
    parser.add_argument("lang", help="language code for directory containing .csv files to merge")
    parser.add_argument("--csv-path", help="path to the dir containing the .csv files")
    parser.add_argument("--data-path", default="data/", help="path to where the merged csv will be written")
    parser.add_argument("--fix-2017-data", action="store_true", help="fix the 2017 data")
    return parser.parse_args(argv)
|
|
|
|
|
|
def _load_translations(csv_file):
    """Read one source CSV and return its cleaned rows, or None to skip it.

    The first 13 lines of the file are skipped before parsing. Columns whose
    name starts with "Unnamed" and columns that are entirely empty are
    dropped, then rows whose second column (the translation) is missing are
    removed.

    Returns None (after printing a notice) when the file has no parsable
    data or fewer than two usable columns.
    """
    try:
        df = pd.read_csv(csv_file, skiprows=13, delimiter=",", encoding="utf-8", on_bad_lines="warn")
    except pd.errors.EmptyDataError:
        print(f"Empty file '{csv_file}' found, skipping…")
        return None

    # Drop the placeholder columns pandas names "Unnamed: N", then empty ones.
    df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
    df = df.dropna(axis=1, how="all")

    # Badly formatted files may not have a translation column at all.
    if len(df.columns) < 2:
        print(f"File '{csv_file}' has no translation column, skipping…")
        return None

    # Drop lines with no translation.
    return df.dropna(subset=[df.columns[1]], how="all")


def main():
    """Merge all ``*.csv`` files of one language into a single TSV file.

    Input files are taken from ``--csv-path`` (default
    ``csv_to_merge/<lang>``); the result is written to
    ``<data-path>/<lang>/merged_exc_<lang>.csv``, starting with a fixed
    tab-separated header line followed by the cleaned rows of every input.

    Raises:
        SystemExit: with a message when no input CSV is found, or with
            status 1 (after printing the error) when an OSError occurs
            while creating the output directory or reading/writing files.
    """
    args = parse_args()

    if args.csv_path is None:
        csv_path = os.path.join("csv_to_merge", args.lang)
    else:
        csv_path = args.csv_path
    csv_files = glob.glob(os.path.join(csv_path, "*.csv"))

    # Fail before touching the output location so that an empty input
    # directory does not leave a header-only merged file behind.
    if not csv_files:
        raise SystemExit(f"No csv files found in '{csv_path}'")

    if args.fix_2017_data:
        # The first file in most languages (NETFramework2.0SP1.csv)
        # has empty translation rows at the beginning,
        # which messes up pandas' parsing.
        #
        # HACK: move it out of first position by swapping it with the eighth
        # file. This relies on the order glob happens to return.
        if len(csv_files) > 7:
            csv_files[0], csv_files[7] = csv_files[7], csv_files[0]
        else:
            print(f"Only {len(csv_files)} csv file(s) found, skipping the 2017 data fix.")

    data_root = "data" if args.data_path is None else args.data_path
    data_path = os.path.join(data_root, args.lang)
    merged_path = os.path.join(data_path, f"merged_exc_{args.lang}.csv")
    print(f"Writing merged CSV files to '{data_path}'")

    try:
        os.makedirs(data_path, exist_ok=True)

        # Write the header and close the handle *before* pandas appends by
        # path: a still-open "w" handle would flush its buffered header at
        # offset 0 afterwards and overwrite rows that were already appended.
        with open(merged_path, "w", encoding="utf8") as f:
            f.write("source_term\ttranslation\tstring_cat\tplatform\tproduct\tversion\n")

        for i, csv_file in enumerate(csv_files):
            print(f"Reading file {csv_file}… ({i+1}/{len(csv_files)})")

            df = _load_translations(csv_file)
            if df is None:
                continue

            # NOTE(review): to_csv writes each input's own header row as well
            # (header=True is the default), so column-name lines repeat inside
            # the merged file. Kept as-is; confirm whether consumers rely on it.
            df.to_csv(merged_path, mode="a", sep="\t", index=False)

            print(f"Parsed {csv_file} successfully.")
    except OSError as e:
        print(f"{type(e).__name__}: {e}")
        raise SystemExit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the merge only when the file is executed as a script.
    main()