1
+ import openpyxl , re , os , csv
2
+ from openpyxl .utils import get_column_letter
3
+ from utils import *
4
+
5
+ from typing import TypeAlias
6
+ from openpyxl .worksheet .worksheet import Worksheet
7
+
8
+ ExtractedData : TypeAlias = tuple [tuple [str , ...], ...]
9
+ Patterns : TypeAlias = list [str ]
10
+
11
class CSVFileExtractor:
    """Append regex matches found in a text to a CSV file."""

    def __init__(self, col_var):
        """Keep the flag object that selects the column layout.

        Args:
            col_var: Object whose ``get()`` is truthy when each pattern's
                matches should be written as a column instead of a row
                (presumably a tkinter variable -- confirm against the caller).
        """
        self.col_var = col_var

    def create_csv_file(self, output_file: str, patterns: Patterns, content: str) -> str | None:
        """Extract matches from ``content`` and append them to ``output_file``.

        Without the column layout, every pattern contributes one CSV row
        holding all of its matches.  With it, the data is transposed through
        ``DataExtractor.create_column_order`` first.

        Args:
            output_file: Path of the CSV file, opened in append mode.
            patterns: Regular expressions to search for, in order.
            content: Text to search.

        Returns:
            The log text built by ``DataExtractor.log_found_data`` when
            ``WithLogging.with_logging`` is truthy, otherwise ``None``.
        """
        extracted_data = DataExtractor.extract_data(patterns, content)

        # Take a real snapshot for the log.  A plain alias would share the
        # inner lists with ``extracted_data`` and therefore pick up any
        # padding that the column re-ordering adds to them.
        logged_data = None
        if WithLogging.with_logging:
            logged_data = [list(matches) for matches in extracted_data]

        if self.col_var.get():
            extracted_data = DataExtractor.create_column_order(extracted_data)

        # newline='' lets the csv module control line endings itself.
        with open(output_file, 'a', newline='', encoding=ENCODING) as f:
            csv.writer(f).writerows(extracted_data)

        if logged_data is None:
            return None
        return DataExtractor.log_found_data(logged_data)
30
+
31
class ExcelFileExtractor:
    """Write regex matches found in a text into a sheet of an Excel workbook."""

    def __init__(self, col_var, exact_var):
        """Keep the flag objects that select the output layout.

        Args:
            col_var: Object whose ``get()`` is truthy when matches should be
                transposed into columns before being appended.
            exact_var: Object whose ``get()`` is truthy when each pattern must
                be written into its own fixed column (takes precedence over
                ``col_var``).
        """
        self.col_var = col_var
        self.exact_var = exact_var

    @staticmethod
    def find_max(index: int, sheet: Worksheet) -> int:
        """Return the row number of the last non-empty cell in a column.

        Args:
            index: 1-based column index to scan.
            sheet: Worksheet to inspect.

        Returns:
            The row of the last cell whose value is not ``None``, or ``0``
            when the column holds no values.
        """
        last_row = 0
        for row in sheet.iter_rows(min_col=index, max_col=index):
            cell = row[0]
            if cell.value is not None:
                last_row = cell.row
        return last_row

    @staticmethod
    def put_data_in_excel_without_exact_order(extracted_data: ExtractedData, sheet: Worksheet) -> None:
        """Append every entry of ``extracted_data`` to ``sheet`` as a new row."""
        for data_list in extracted_data:
            sheet.append(data_list)

    @staticmethod
    def get_cell(pattern_letter: str, row_number: int):
        """Return the cell reference for a column letter and row, e.g. ``'A3'``."""
        return pattern_letter + str(row_number)

    @staticmethod
    def put_data_in_excel_with_exact_order(extracted_data: ExtractedData, sheet: Worksheet) -> None:
        """Write the matches of the N-th pattern into the N-th column.

        Each column is extended independently: new values start right below
        the last non-empty cell already present in that column.

        Args:
            extracted_data: One sequence of matches per pattern.
            sheet: Worksheet that receives the values.
        """
        for column_index, data_list in enumerate(extracted_data, start=1):
            first_free_row = ExcelFileExtractor.find_max(column_index, sheet) + 1
            for row_number, item in enumerate(data_list, start=first_free_row):
                sheet.cell(row=row_number, column=column_index, value=item)

    def create_excel_file(self, output_file: str, sheet_name: str, patterns: Patterns, content: str) -> str | None:
        """Extract matches from ``content`` and store them in a workbook.

        The workbook at ``output_file`` is loaded when it exists and created
        otherwise.  Data goes to the sheet named ``sheet_name.title()``, which
        is added to the workbook if it is missing.

        Args:
            output_file: Path of the workbook to update or create.
            sheet_name: Requested sheet name (title-cased before use).
            patterns: Regular expressions to search for, in order.
            content: Text to search.

        Returns:
            The log text built by ``DataExtractor.log_found_data`` when
            ``WithLogging.with_logging`` is truthy, otherwise ``None``.
        """
        # Extract before touching the file system so that an invalid pattern
        # cannot leave a freshly created, empty workbook behind.
        extracted_data = DataExtractor.extract_data(patterns, content)

        # Real snapshot for the log: an alias would share the inner lists and
        # pick up padding added by the column re-ordering below.
        logged_data = None
        if WithLogging.with_logging:
            logged_data = [list(matches) for matches in extracted_data]

        exact_order = self.exact_var.get()
        if self.col_var.get() and not exact_order:
            extracted_data = DataExtractor.create_column_order(extracted_data)

        if os.path.isfile(output_file):
            wb = openpyxl.load_workbook(output_file)
        else:
            wb = openpyxl.Workbook()

        try:
            sheet_title = sheet_name.title()
            if sheet_title in wb.sheetnames:
                sheet = wb[sheet_title]
            else:
                sheet = wb.create_sheet(sheet_title)

            if exact_order:
                ExcelFileExtractor.put_data_in_excel_with_exact_order(extracted_data, sheet)
            else:
                ExcelFileExtractor.put_data_in_excel_without_exact_order(extracted_data, sheet)

            wb.save(output_file)
        finally:
            # Release the workbook even when writing or saving fails.
            wb.close()

        if logged_data is None:
            return None
        return DataExtractor.log_found_data(logged_data)
101
+
102
class DataExtractor:
    """Read an input file, extract regex matches and hand them to a writer."""

    def __init__(self, excel_var, log_text, col_var, exact_var):
        """Wire up the output writers and the log widget.

        Args:
            excel_var: Object whose ``get()`` is truthy when the output should
                be an Excel workbook rather than a CSV file.
            log_text: Text widget that displays the log; it must provide
                ``config``, ``delete``, ``insert`` and ``see``.
            col_var: Layout flag forwarded to both writers.
            exact_var: Layout flag forwarded to the Excel writer.
        """
        self.log_text = log_text
        self.excel_var = excel_var

        self.excel_extractor = ExcelFileExtractor(col_var, exact_var)
        self.csv_extractor = CSVFileExtractor(col_var)

    @staticmethod
    def extract_data(patterns: Patterns, content: str) -> ExtractedData:
        """Run every pattern over ``content`` with ``re.findall``.

        Args:
            patterns: Regular expressions to search for.
            content: Text to search.

        Returns:
            A list with one list of matches per pattern, in pattern order.
            Note that ``re.findall`` yields tuples instead of strings for
            patterns containing more than one capturing group.
        """
        return [re.findall(pattern, content) for pattern in patterns]

    def prepare_to_extract_data(self, output_file: str, input_file: str, sheet_name: str, patterns: Patterns) -> None:
        """Validate the request, run the extraction and refresh the log.

        Validation and I/O problems are not propagated; they are passed to
        ``show_error`` instead.

        Args:
            output_file: Destination path (CSV, or a workbook when the Excel
                flag is set).
            input_file: Path of the text file to read.
            sheet_name: Sheet name used for Excel output.
            patterns: Regular expressions to search for.
        """
        try:
            # Explicit raises instead of ``assert``: assertions are removed
            # when Python runs with -O, which would silently skip validation.
            if not patterns:
                raise ValueError('There is no patterns to extract data')

            with open(input_file, encoding=ENCODING) as f:
                try:
                    content = f.read()
                except UnicodeDecodeError as err:
                    raise ValueError('The input file cannot be a binary file') from err

            if not output_file:
                raise ValueError('The name of output file is required.')

            output_file_extention = os.path.splitext(output_file)[1].lower()

            if self.excel_var.get():
                if output_file_extention not in ('.xlsx', '.xlsm', '.xltx', '.xltm'):
                    raise ValueError('The output file format is not supported. It should be .xlsx, .xlsm, .xltx or .xltm')
                log_string = self.excel_extractor.create_excel_file(output_file, sheet_name, patterns, content)
            else:
                log_string = self.csv_extractor.create_csv_file(output_file, patterns, content)

            if WithLogging.with_logging:
                log_string += f'\n {output_file!r} saved.' + '\n '
                # The widget is kept read-only; unlock it just for the update.
                self.log_text.config(state='normal')
                self.log_text.delete('1.0', 'end')
                self.log_text.insert('end', log_string)
                self.log_text.config(state='disabled')
                self.log_text.see('end')

        except (FileNotFoundError, AssertionError, PermissionError, ValueError, re.PatternError) as err:
            show_error(err)

    @staticmethod
    def log_found_data(extracted_data_copy: ExtractedData) -> str:
        """Build the log text listing every match of every pattern.

        Args:
            extracted_data_copy: One sequence of string matches per pattern.

        Returns:
            The matches of each pattern joined by the separator, each group
            followed by the same separator; an empty string for no patterns.
        """
        return ''.join('\n '.join(data_list) + '\n ' for data_list in extracted_data_copy)

    @staticmethod
    def create_column_order(extracted_data: ExtractedData) -> tuple[tuple[str, ...], ...]:
        """Transpose per-pattern matches into rows with one cell per pattern.

        Shorter match sequences are padded with empty strings so that every
        row has one entry per pattern.  The input is left unmodified.

        Args:
            extracted_data: One sequence of matches per pattern.

        Returns:
            A tuple of rows; empty when ``extracted_data`` is empty.
        """
        from itertools import zip_longest

        return tuple(zip_longest(*extracted_data, fillvalue=''))
0 commit comments