Closed dinjojo closed 4 months ago
Thanks for highlighting this issue. Just a couple of basic checks:
1) Have you uploaded the CAMS detailed statement (not the summary)? If so, can you please try uploading only the current year's statement?
2) Is there a way to recreate this problem? I know it's a sensitive document, and I certainly don't want to see your statement.
Yes, I'm using the detailed statement. Let me try with a smaller window, and let me also try with a sample CAMS statement.
I tried with this financial year and it didn't work. One thing I noticed was that even after entering the password it kept saying "enter password", so I removed the password from the PDF, but it still created a 1 KB file.
Can you try with the latest CAMS statement? It seems they have now added a summary section at the top, which could be the reason it's failing. @SudheerNotes
That's very strange... I just downloaded my CAMS statement again, from 2012 to the present, and the tool was able to export all the data. It's true that they have added a summary table, but that does not seem to have any impact here.
For now I'm testing this on my Linux machine... I'll give it a try on Windows as well (sorry, I'm struggling for time). In the meantime, can you try uninstalling it completely and reinstalling?
I just tried again on Windows 11, and it's working fine for me. Looks like some sort of edge case... Maybe we can connect this weekend over private chat?
Can you share just the Python code, with the UI references removed? I can try running it at my end to debug.
Here is the code without the GUI, but make sure to replace these two items: "YOUR PDF FILE PATH" and "YOUR PDF PASSWORD". Please make sure to share your findings; I really want to squash this bug with your help.
import os
import pdfplumber
import re
from pandas import DataFrame
from datetime import datetime
# Directory containing this script; not referenced by the code shown in this snippet.
basedir = os.path.dirname(__file__)
def extract_text(doc_txt):
    """Parse CAMS statement text into transactions and export them as CSV.

    ``doc_txt`` is scanned line by line. The most recent line matching the
    fund-name pattern and the most recent line matching the folio pattern
    are remembered, and one row is recorded for every line that matches the
    transaction pattern. The rows are written to a timestamped
    ``CAMS_data_NEW_<dd_mm_YYYY_HH_MM>.csv`` file in the current user's
    ``Downloads`` directory.

    If no line matches the transaction pattern, the CSV still gets written
    but contains only the header row.

    Args:
        doc_txt: Newline-separated text extracted from the statement PDF.

    Returns:
        None. The result is the CSV file written to disk.
    """
    # Defining RegEx patterns
    folio_pat = re.compile(r"(^Folio No:\s\d+)")  # Extracting Folio information
    fund_name = re.compile(r".+Fund.+\s:")  # Extracting Fund Name
    trans_details = re.compile(
        r"(^\d{2}-\w{3}-\d{4})(\s.+?\s(?=[\d(]))([\d\(]+[,.]\d+[.\d\)]+)(\s[\d\(\,\.\)]+)(\s[\d\,\.]+)(\s[\d,\.]+)"
    )  # Extracting Transaction data

    # Start with empty values: a transaction line that appears before any
    # fund/folio heading was matched used to raise UnboundLocalError.
    folio = ""
    fun_name = ""
    line_itms = []
    for line in doc_txt.splitlines():
        if fund_name.match(line):
            fun_name = line
        if folio_pat.match(line):
            folio = line
        txt = trans_details.search(line)
        if txt:
            # The pattern has exactly six capturing groups, in column order.
            date, description, amount, units, price, unit_bal = txt.groups()
            line_itms.append(
                [folio, fun_name, date, description, amount, units, price, unit_bal]
            )

    df = DataFrame(
        line_itms,
        columns=[
            "Folio",
            "Fund_name",
            "Date",
            "Description",
            "Amount",
            "Units",
            "Price",
            "Unit_balance",
        ],
    )

    # Assign the cleaned column back explicitly instead of relying on the
    # in-place mutation reaching the parent DataFrame.
    for col in ("Amount", "Units", "Price", "Unit_balance"):
        df[col] = clean_txt(df[col]).astype('float')

    file_name = f'CAMS_data_NEW_{datetime.now().strftime("%d_%m_%Y_%H_%M")}.csv'
    save_file = os.path.join(os.path.expanduser('~'), 'Downloads', file_name)
    df.to_csv(save_file, index=False)
def clean_txt(x):
    """Strip number formatting from a pandas object of strings, in place.

    Commas are removed, every ``(`` becomes ``-`` and every ``)`` becomes a
    single space, so a value such as ``(1,234.50)`` ends up as ``-1234.50 ``.

    Args:
        x: Object exposing pandas' ``replace(..., regex=True, inplace=True)``;
            callers in this script pass DataFrame columns. It is modified
            in place.

    Returns:
        The same object ``x`` that was passed in.
    """
    # Raw strings keep the regex backslashes literal; the earlier "\(" and
    # "\)" literals were invalid escape sequences in ordinary strings.
    substitutions = ((r",", ""), (r"\(", "-"), (r"\)", " "))
    for pattern, replacement in substitutions:
        x.replace(pattern, replacement, regex=True, inplace=True)
    return x
# Collect the text of every page of the statement PDF, then parse and export it.
final_text = ""
file_path = 'YOUR PDF FILE PATH'
if file_path:
    with pdfplumber.open(file_path, password='YOUR PDF PASSWORD') as pdf:
        # Each page's text is prefixed with a newline, as before.
        # NOTE(review): extract_text() is assumed to be able to return None
        # for a page with no extractable text in some pdfplumber versions;
        # "or ''" keeps the concatenation from failing in that case.
        final_text = "".join(
            "\n" + (page.extract_text() or "") for page in pdf.pages
        )
    # No explicit pdf.close(): the with-statement is relied on to close the PDF.
    # The original paste lost its indentation; parsing is only attempted
    # when a file path was supplied.
    extract_text(doc_txt=final_text)
There are some errors when running the code:
C:\Program Files\Python311>python.exe C:\Users\sample.py
Traceback (most recent call last):
File "C:\Users\sample.py", line 74, in <module>
extract_text(doc_txt=final_text)
File "C:\Users\sample.py", line 40, in extract_text
line_itms.append([folio,fun_name,date,description,amount,units,price,unit_bal])
^^^^^
UnboundLocalError: cannot access local variable 'fun_name' where it is not associated with a value
Can you try this? I have removed the function to resolve the variable assignment conflicts:
import os
import pdfplumber
import re
from pandas import DataFrame
from datetime import datetime
# Directory containing this script; not referenced by the code shown in this snippet.
basedir = os.path.dirname(__file__)
def clean_txt(x):
    """Strip number formatting from a pandas object of strings, in place.

    Commas are removed, every ``(`` becomes ``-`` and every ``)`` becomes a
    single space, so a value such as ``(1,234.50)`` ends up as ``-1234.50 ``.

    Args:
        x: Object exposing pandas' ``replace(..., regex=True, inplace=True)``;
            callers in this script pass DataFrame columns. It is modified
            in place.

    Returns:
        The same object ``x`` that was passed in.
    """
    # Raw strings keep the regex backslashes literal; the earlier "\(" and
    # "\)" literals were invalid escape sequences in ordinary strings.
    substitutions = ((r",", ""), (r"\(", "-"), (r"\)", " "))
    for pattern, replacement in substitutions:
        x.replace(pattern, replacement, regex=True, inplace=True)
    return x
# Collect the text of every page of the statement PDF into final_text.
final_text = ""
file_path = 'YOUR FILE PATH'
if file_path:
    with pdfplumber.open(file_path, password='YOUR PASSWORD') as pdf:
        # Each page's text is prefixed with a newline, as before.
        # NOTE(review): extract_text() is assumed to be able to return None
        # for a page with no extractable text in some pdfplumber versions;
        # "or ''" keeps the concatenation from failing in that case.
        final_text = "".join(
            "\n" + (page.extract_text() or "") for page in pdf.pages
        )
    # No explicit pdf.close(): the with-statement is relied on to close the PDF.
# Parse the extracted statement text into transaction rows and export them
# as a timestamped CSV in the current user's Downloads directory.

# Defining RegEx patterns
folio_pat = re.compile(r"(^Folio No:\s\d+)")  # Extracting Folio information
fund_name = re.compile(r".+Fund.+\s:")  # Extracting Fund Name
trans_details = re.compile(
    r"(^\d{2}-\w{3}-\d{4})(\s.+?\s(?=[\d(]))([\d\(]+[,.]\d+[.\d\)]+)(\s[\d\(\,\.\)]+)(\s[\d\,\.]+)(\s[\d,\.]+)"
)  # Extracting Transaction data

# Start with empty values: a transaction line that appears before any
# fund/folio heading was matched would otherwise hit an unbound name.
folio = ""
fun_name = ""
line_itms = []
for i in final_text.splitlines():
    if fund_name.match(i):
        fun_name = i
    if folio_pat.match(i):
        folio = i
    txt = trans_details.search(i)
    if txt:
        # The pattern has exactly six capturing groups, in column order.
        date, description, amount, units, price, unit_bal = txt.groups()
        line_itms.append(
            [folio, fun_name, date, description, amount, units, price, unit_bal]
        )

# When no line matched, the DataFrame is empty and the CSV written below
# contains only the header row.
df = DataFrame(
    line_itms,
    columns=[
        "Folio",
        "Fund_name",
        "Date",
        "Description",
        "Amount",
        "Units",
        "Price",
        "Unit_balance",
    ],
)

# Assign the cleaned column back explicitly instead of relying on the
# in-place mutation reaching the parent DataFrame.
for col in ("Amount", "Units", "Price", "Unit_balance"):
    df[col] = clean_txt(df[col]).astype('float')

file_name = f'CAMS_data_NEW_{datetime.now().strftime("%d_%m_%Y_%H_%M")}.csv'
save_file = os.path.join(os.path.expanduser('~'), 'Downloads', file_name)
df.to_csv(save_file, index=False)
Hi - I have released a new version today (v1.01), so can you please give it a try?
Nope, still the same thing. I will try with the code now.
Only a 1 KB file gets generated on Windows 11.