Open alex8937 opened 8 months ago
We design different regex patterns according to the grammar characteristics of different languages.
For example, for the C language, we use `#include` directives to find dependencies:
def extract_includes(file_content):
    """Return the set of paths named by C ``#include`` directives.

    Both the quoted form (``#include "foo.h"``) and the angle-bracket form
    (``#include <foo.h>``) are recognised.

    Args:
        file_content: Full text of a C source or header file.

    Returns:
        A set of include paths exactly as written between the delimiters
        (e.g. ``"stdio.h"`` or ``"utils/list.h"``).
    """
    # A directive's '#' must be the first non-blank character on its line, so
    # the pattern is anchored with MULTILINE; this skips commented-out
    # includes such as '// #include "old.h"'.  Whitespace is optional both
    # between '#' and 'include' and before the delimiter, because
    # '#include<stdio.h>' and '#  include "x.h"' are valid C.  Newlines are
    # excluded from the path so an unterminated delimiter cannot swallow the
    # following lines.
    include_pattern = r'^[ \t]*#[ \t]*include[ \t]*(?:"([^"\n]+)"|<([^>\n]+)>)'
    return {
        match.group(1) or match.group(2)
        for match in re.finditer(include_pattern, file_content, re.MULTILINE)
    }
def find_dependencies(file_paths, contents):
    """Find include relationships between the C files in *file_paths*.

    Only paths ending in ``.c`` or ``.h`` are considered.  A file whose
    content is empty or whitespace-only contributes no dependencies.

    Args:
        file_paths: Iterable of file path strings.
        contents: Mapping from file path to that file's text; it is indexed
            with every ``.c``/``.h`` path in *file_paths*.

    Returns:
        A list of ``(included_path, including_path)`` tuples, ordered by
        including file and then by included file, both in *file_paths* order.
    """
    c_suffixes = (".c", ".h")
    sources = [path for path in file_paths if path.endswith(c_suffixes)]

    edges = []
    for includer in sources:
        text = contents[includer]
        if text.strip():
            included = extract_includes(text)
            for candidate in sources:
                if candidate == includer:
                    continue
                base_name = os.path.basename(candidate)
                tail = "/" + base_name
                # An include refers to the candidate when it is the bare
                # file name or a path whose last component is that name.
                if any(inc == base_name or inc.endswith(tail) for inc in included):
                    edges.append((candidate, includer))
    return edges
Could you share more details of the regex patterns used in dependency parsing for each language?