Issue (open) — reported by datadu-de, 2 years ago
import databricks.koalas as ks
import os
# Define a function to read and add input_file_name
def read_files_with_input_file_name(path, extension=".csv"):
    """Read every ``extension`` file in ``path`` into one Koalas DataFrame.

    Each per-file DataFrame gets an ``input_file_name`` column holding the
    bare file name, mimicking PySpark's ``input_file_name()`` for data that
    is read file-by-file.

    Args:
        path: Directory containing the data files.
        extension: File suffix to select (default ``".csv"``). Files with
            other suffixes are skipped.

    Returns:
        A single Koalas DataFrame, the concatenation of all per-file frames.

    Raises:
        FileNotFoundError: If no file in ``path`` matches ``extension``
            (clearer than the bare ``ValueError`` ``ks.concat([])`` raises).
    """
    dfs = []
    # sorted() makes the concatenation order deterministic; os.listdir
    # returns entries in arbitrary order.
    for file in sorted(os.listdir(path)):
        if file.endswith(extension):
            df = ks.read_csv(os.path.join(path, file))
            df["input_file_name"] = file
            dfs.append(df)
    if not dfs:
        raise FileNotFoundError(
            f"no {extension!r} files found in {path!r}"
        )
    return ks.concat(dfs)
# Guard the demo so importing this module does not trigger file I/O;
# the body only runs when the file is executed as a script.
if __name__ == "__main__":
    # Specify the folder containing your files.
    # NOTE: placeholder path — replace with a real directory before running.
    folder_path = "/path/to/your/files"

    # Read the files and add input_file_name
    result_df = read_files_with_input_file_name(folder_path)

    # Now you have a DataFrame with an "input_file_name" column
    print(result_df)
When reading a large folder of files with Koalas, it would be great to be able to add a column containing the source file name, as PySpark's `input_file_name()` allows. Calling `input_file_name()` after the folder has already been read does not work for adding the name.