
Code 1

import re

import apache_beam as beam

# CASINO_RETAILER_NAME, SCHEMA, VentesTargetOutput, CSVReader,
# preprocessing_common, NORMALIZED_COLUMNS and sum_values come from the
# project's own modules; their imports are not shown in this document.


class RunPipelineCasinoVentes(RunPipeline):
    retailer_name = CASINO_RETAILER_NAME
    schema_input = SCHEMA

    def processing_data(self, element):
        # The same dict serves as both the preprocessing and the processing
        # row here, so it is deliberately passed twice.
        element = self.add_technical_fields_processing(element, element)

        # Recode "typologie": EANs starting with 322247 or 360282 are always
        # treated as private label ("MDD"); otherwise "2" maps to national
        # brand ("MN") and "1" to "MDD". Anything else becomes None.
        typologie = element[VentesTargetOutput.typologie].strip()
        is_regex = re.search(r"^(322247|360282).+", element[VentesTargetOutput.ean])
        if not is_regex and typologie == "2":
            element[VentesTargetOutput.typologie] = "MN"
        elif not is_regex and typologie == "1":
            element[VentesTargetOutput.typologie] = "MDD"
        elif is_regex and typologie in ("0", "1"):
            element[VentesTargetOutput.typologie] = "MDD"
        else:
            element[VentesTargetOutput.typologie] = None

        # Normalize the supplier code: strip whitespace, force the "S"
        # prefix, then truncate to 7 characters.
        PREFIX_S = "S"
        code_fournisseur = element[VentesTargetOutput.code_fournisseur]
        if code_fournisseur is not None:
            code_fournisseur = code_fournisseur.strip()
            if not code_fournisseur.startswith(PREFIX_S):
                code_fournisseur = PREFIX_S + code_fournisseur
            element[VentesTargetOutput.code_fournisseur] = code_fournisseur[:7]
        return element

    @staticmethod
    def preprocessing_data(element, schema_input):
        return preprocessing_common(element, schema_input)

    def get_reader(self):
        return CSVReader(encoding=self.encoding)

    def apply_processing_data(self, preprocessed_data, pipeline):
        # Dimensions: one output row per distinct combination of these keys.
        keys = (
            VentesTargetOutput.annee_mois,
            VentesTargetOutput.ean,
            VentesTargetOutput.code_fournisseur,
            VentesTargetOutput.nom_fournisseur,
            VentesTargetOutput.promo,
            VentesTargetOutput.code_interne,
            VentesTargetOutput.code_ue,
            VentesTargetOutput.code_famille,
            VentesTargetOutput.libelle_pdt,
            VentesTargetOutput.typologie,
        )
        # Measures summed within each group.
        fields_to_sum = (
            VentesTargetOutput.ca_ventes_ht,
            VentesTargetOutput.ca_ventes_ttc,
            VentesTargetOutput.ca_srp,
            VentesTargetOutput.volume,
        )
        individual_fields = VentesTargetOutput().get_technical_fields_preprocessing()
        processing = (
            preprocessed_data
            | NORMALIZED_COLUMNS
            >> beam.Map(
                self.get_normalized_columns,
                schema_input=self.schema_input,
                individual_fields=individual_fields,
            )
            | sum_values(keys, fields_to_sum, individual_fields)
            | "processing" >> beam.Map(self.processing_data)
        )
        return processing
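
The sum_values composite used above is not shown in this document. Below is a minimal sketch of what it could look like, assuming it groups rows on the key columns and sums the measure columns, carrying every other field (including the technical individual_fields) through from the first row of each group; the step names and exact semantics are assumptions:

import apache_beam as beam

@beam.ptransform_fn
def sum_values(pcoll, keys, fields_to_sum, individual_fields):
    # individual_fields are already present on every row; keeping the first
    # row of each group carries them through implicitly.
    def merge(kv):
        _, rows = kv
        rows = list(rows)
        merged = dict(rows[0])  # key columns + technical fields
        for field in fields_to_sum:
            merged[field] = sum(row[field] or 0 for row in rows)
        return merged

    return (
        pcoll
        | "KeyByDimensions" >> beam.Map(lambda row: (tuple(row[k] for k in keys), row))
        | "GroupByDimensions" >> beam.GroupByKey()
        | "SumMeasures" >> beam.Map(merge)
    )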

from datetime import datetime
from typing import Any, Dict

import apache_beam as beam
from apache_beam.io.gcp.bigquery import WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions

# Table, ENSEIGNE, NORMALIZED_COL_NAME, check_file_exists and
# delete_and_return_file_path come from the project's own modules;
# their imports are not shown in this document.


class RunPipeline:
    retailer_name = None
    is_processing = True
    schema_input = None

    def __init__(
        self,
        input_data: str,
        options: PipelineOptions,
        date_file: str,
        debug_mode: bool,
        encoding: str,
        table_name_preprocessing: str,
        table_name_processing: str,
    ):
        self.input_data = input_data
        self.options = options
        self.date_file = date_file
        self.debug_mode = debug_mode
        self.encoding = encoding
        self.table_name_preprocessing = table_name_preprocessing
        self.table_name_processing = table_name_processing
        # Inputs on GCS are treated as files; anything else is not.
        self.is_file = self.input_data.startswith("gs://")

    def keep_idempotence(self, pipeline):
        # Seed the pipeline with the input path, optionally check that the
        # file exists, then delete any rows previously written for this
        # date_file so that re-running the pipeline never duplicates data.
        empty_pcollection = pipeline | "CreateEmptyPBegin" >> beam.Create([self.input_data])
        if self.is_file:
            empty_pcollection = empty_pcollection | "CheckFileExists" >> beam.Map(check_file_exists)

        file_path = empty_pcollection | "DeleteAndReturnFilePath" >> beam.Map(
            delete_and_return_file_path,
            table_preprocessing=self.table_name_preprocessing,
            table_processing=self.table_name_processing,
            column_name_preprocessing=Table.date_file,
            column_name_processing=Table.execution_date,
            date_file=self.date_file,
            is_processing=self.is_processing,
        )
        return file_path

    def write_processing(self, processed_data):
        # In debug mode, print rows instead of writing them to BigQuery.
        if self.debug_mode:
            processed_data | "Printing before WriteToBigQuery Processing" >> beam.Map(print)
        else:
            processed_data | "WriteProcessing" >> WriteToBigQuery(
                table=self.table_name_processing,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            )

    def write_preprocessing(self, preprocessed_data):
        if self.debug_mode:
            preprocessed_data | "Printing before WriteToBigQuery Preprocessing" >> beam.Map(print)
        else:
            preprocessed_data | "WritePreprocessing" >> WriteToBigQuery(
                table=self.table_name_preprocessing,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            )

    @staticmethod
    def add_technical_fields_processing(
        element_processing: Dict[str, Any],
        element_preprocessing: Dict[str, Any],
    ) -> Dict[str, Any]:
        # Copy the technical columns from the preprocessing row onto the
        # processing row, renaming source_file/date_file along the way.
        element_processing[Table.processing_ts] = datetime.utcnow()
        element_processing[Table.preprocessing_ts] = element_preprocessing[Table.preprocessing_ts].isoformat()
        element_processing[Table.source_files] = element_preprocessing.pop(Table.source_file)
        element_processing[Table.execution_date] = element_preprocessing.pop(Table.date_file)
        element_processing[Table.retailer] = element_preprocessing[Table.retailer]
        return element_processing

    def add_technical_fields_preprocessing(
        self,
        element: Dict[str, Any],
        source_file: str,
        date_file: str,
        retailer_name: Any,
    ) -> Dict[str, Any]:
        element[Table.preprocessing_ts] = datetime.utcnow()
        element[Table.source_file] = source_file
        element[Table.date_file] = date_file
        # retailer_name may be a plain name or a mapping keyed by the raw
        # "enseigne" (banner) column; unmatched banners fall back to "null".
        if isinstance(retailer_name, dict):
            element[Table.retailer] = next(
                (value for key, value in retailer_name.items() if element[ENSEIGNE] == key),
                "null",
            )
        else:
            element[Table.retailer] = retailer_name
        return element

    @staticmethod
    def processing_data(element):
        # Identity by default; subclasses override with retailer-specific logic.
        return element

    def apply_processing_data(self, preprocessed_data, pipeline):
        return preprocessed_data | "processing" >> beam.Map(self.processing_data)

    @staticmethod
    def preprocessing_data(element: Dict[str, Any], schema_input) -> Dict[str, Any]:
        return element

    def apply_preprocessing_data(self, reader_data):
        return (
            reader_data
            | "preprocessing" >> beam.Map(self.preprocessing_data, schema_input=self.schema_input)
            | "TechnicalFieldsPreprocessing"
            >> beam.Map(
                self.add_technical_fields_preprocessing,
                source_file=self.input_data,
                date_file=self.date_file,
                retailer_name=self.retailer_name,
            )
        )

    def get_reader(self):
        # Subclasses must return a DoFn that reads and parses the input file.
        raise NotImplementedError

    @staticmethod
    def get_normalized_columns(
        element,
        schema_input,
        individual_fields=(),  # tuple, not list: avoids the mutable-default pitfall
    ):
        # Rename raw input columns to their normalized names and carry the
        # technical fields through untouched.
        result = {}
        for k, v in schema_input.items():
            if NORMALIZED_COL_NAME in v.keys():
                result[v[NORMALIZED_COL_NAME]] = element[k]
        for k in individual_fields:
            result[k] = element[k]
        return result

    def run(self) -> None:
        with beam.Pipeline(options=self.options) as pipeline:
            file_path = self.keep_idempotence(pipeline)

            reader_data = file_path | "ReadAndParse" >> beam.ParDo(self.get_reader())

            preprocessed_data = self.apply_preprocessing_data(reader_data)

            self.write_preprocessing(preprocessed_data)

            if self.is_processing:
                processed_data = self.apply_processing_data(preprocessed_data, pipeline=pipeline)

                self.write_processing(processed_data)

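Finally, a minimal, hypothetical invocation of the Casino pipeline; the argument values are illustrative and the real entry point is not part of this document:

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(runner="DirectRunner", project="my-gcp-project")

RunPipelineCasinoVentes(
    input_data="gs://my-bucket/casino/ventes_202405.csv",
    options=options,
    date_file="2024-05-01",
    debug_mode=True,  # print rows instead of writing to BigQuery
    encoding="utf-8",
    table_name_preprocessing="my_dataset.ventes_preprocessing",
    table_name_processing="my_dataset.ventes_processing",
).run()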