Thank you Mark for making Obsidian more accessible to Python users!! :-)
I was giving it a try with 40.076 files (incl. attachments). (Most of the MD files are generated and do not yet contain a lot of links and metadata.)
The method "gather" ran successfully in about 3 minutes! :-)
However, calling df = vault.get_all_file_metadata() raised an error.
I'm not sure whether the following traceback helps to locate the issue.
ValueError Traceback (most recent call last)
Input In [12], in <cell line: 1>()
----> 1 df = vault.get_all_file_metadata()
File C:...\obsidiantools\api.py:1345, in Vault.get_all_file_metadata(self)
1343 warnings.warn('Only notes (md files) were used to build the graph. Set attachments=True in the connect method to show all file metadata.')
1344 else:
-> 1345 df_media = self.get_media_file_metadata()
1346 df_media['graph_category'] = np.where(
1347 df_media['file_exists'], 'attachment', 'nonexistent')
1348 df_canvas = self.get_canvas_file_metadata()
File C:...\obsidiantools\api.py:1249, in Vault._create_media_file_metadata_columns(self, df)
1242 df['abs_filepath'] = np.where(df['rel_filepath'].notna(),
1243 [self._dirpath / str(f)
1244 for f in df['rel_filepath'].tolist()],
1245 np.NaN)
1246 df['file_exists'] = pd.Series(
1247 np.logical_not(df.index.isin(self._nonexistent_media_files)),
1248 index=df.index)
-> 1249 df['n_backlinks'] = self._get_backlink_counts_for_media_files_only()
1250 df['modified_time'] = pd.to_datetime(
1251 [f.lstat().st_mtime if not pd.isna(f)
1252 else pd.NaT
1253 for f in df['abs_filepath'].tolist()],
1254 unit='s')
1255 return df
File C:...\pandas\core\frame.py:3655, in DataFrame.__setitem__(self, key, value)
3652 self._setitem_array([key], value)
3653 else:
3654 # set column
-> 3655 self._set_item(key, value)
File C:...\pandas\core\frame.py:3832, in DataFrame._set_item(self, key, value)
3822 def _set_item(self, key, value) -> None:
3823 """
3824 Add series to DataFrame in specified column.
3825
(...)
3830 ensure homogeneity.
3831 """
-> 3832 value = self._sanitize_column(value)
3834 if (
3835 key in self.columns
3836 and value.ndim == 1
3837 and not is_extension_array_dtype(value)
3838 ):
3839 # broadcast across multiple columns if necessary
3840 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File C:...\pandas\core\common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (38135) does not match length of index (4216)
Thank you Mark for making Obsidian more accessible to Python users!! :-)
I was giving it a try with 40.076 files (incl. attachments). (Most of the MD files are generated and do not yet contain a lot of links and metadata.)
The method "gather" ran successfully in about 3 minutes! :-)
However, calling
df = vault.get_all_file_metadata()
raised an error. I'm not sure whether the following traceback helps to locate the issue.
ValueError Traceback (most recent call last) Input In [12], in <cell line: 1>() ----> 1 df = vault.get_all_file_metadata()
File C:...\obsidiantools\api.py:1345, in Vault.get_all_file_metadata(self) 1343 warnings.warn('Only notes (md files) were used to build the graph. Set attachments=True in the connect method to show all file metadata.') 1344 else: -> 1345 df_media = self.get_media_file_metadata() 1346 df_media['graph_category'] = np.where( 1347 df_media['file_exists'], 'attachment', 'nonexistent') 1348 df_canvas = self.get_canvas_file_metadata()
File C:...\obsidiantools\api.py:1234, in Vault.get_media_file_metadata(self) 1232 return df 1233 else: -> 1234 df = df.pipe(self._create_media_file_metadata_columns) 1235 return df
File C:...\pandas\core\generic.py:5512, in NDFrame.pipe(self, func, *args, **kwargs) 5454 @final 5455 @doc(klass=_shared_doc_kwargs["klass"]) 5456 def pipe( (...) 5460 **kwargs, 5461 ) -> T: 5462 r""" 5463 Apply chainable functions that expect Series or DataFrames. 5464 (...) 5510 ... ) # doctest: +SKIP 5511 """ -> 5512 return com.pipe(self, func, *args, **kwargs)
File C:...\pandas\core\common.py:497, in pipe(obj, func, *args, **kwargs) 495 return func(*args, **kwargs) 496 else: --> 497 return func(obj, *args, **kwargs)
File C:...\obsidiantools\api.py:1249, in Vault._create_media_file_metadata_columns(self, df) 1242 df['abs_filepath'] = np.where(df['rel_filepath'].notna(), 1243 [self._dirpath / str(f) 1244 for f in df['rel_filepath'].tolist()], 1245 np.NaN) 1246 df['file_exists'] = pd.Series( 1247 np.logical_not(df.index.isin(self._nonexistent_media_files)), 1248 index=df.index) -> 1249 df['n_backlinks'] = self._get_backlink_counts_for_media_files_only() 1250 df['modified_time'] = pd.to_datetime( 1251 [f.lstat().st_mtime if not pd.isna(f) 1252 else pd.NaT 1253 for f in df['abs_filepath'].tolist()], 1254 unit='s') 1255 return df
File C:...\pandas\core\frame.py:3655, in DataFrame.__setitem__(self, key, value) 3652 self._setitem_array([key], value) 3653 else: 3654 # set column -> 3655 self._set_item(key, value)
File C:...\pandas\core\frame.py:3832, in DataFrame._set_item(self, key, value) 3822 def _set_item(self, key, value) -> None: 3823 """ 3824 Add series to DataFrame in specified column. 3825 (...) 3830 ensure homogeneity. 3831 """ -> 3832 value = self._sanitize_column(value) 3834 if ( 3835 key in self.columns 3836 and value.ndim == 1 3837 and not is_extension_array_dtype(value) 3838 ): 3839 # broadcast across multiple columns if necessary 3840 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File C:...\pandas\core\frame.py:4538, in DataFrame._sanitize_column(self, value) 4535 return _reindex_for_setitem(value, self.index) 4537 if is_list_like(value): -> 4538 com.require_length_match(value, self.index) 4539 return sanitize_array(value, self.index, copy=True, allow_2d=True)
File C:...\pandas\core\common.py:557, in require_length_match(data, index) 553 """ 554 Check the length of data matches the length of the index. 555 """ 556 if len(data) != len(index): --> 557 raise ValueError( 558 "Length of values " 559 f"({len(data)}) " 560 "does not match length of index " 561 f"({len(index)})" 562 )
ValueError: Length of values (38135) does not match length of index (4216)