# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import hashlib
import json
import os

# ==================== Documents digests

def _read_lines_with_prefix(document, position, prefix):
    """
    Starting from the given position, it parses from the document complete
    lines (with '\n' character at the end) starting with the given prefix.
    The parser stops on the first line that does not start with the prefix
    or when there are no more '\n' characters in the document.

    @param document: a document to parse
    @param position: an offset in the document to start from
    @param prefix: a prefix that every parsed line must start with

    @returns a pair (lines, position), where the first element is a list of
        parsed lines (with '\n' character at the end) and the second element
        is a new offset in the document, pointing at the first character
        after the last parsed line

    """
    lines = []
    while document.startswith(prefix, position):
        position_next_line = document.find('\n', position + len(prefix))
        if position_next_line < 0:
            break  # incomplete last line; leave it unparsed
        position_next_line += 1  # to eat the '\n' character
        lines.append(document[position:position_next_line])
        position = position_next_line
    return lines, position


def _process_PJL_headers(doc, position, out):
    """
    The function tries to find PJL headers in the given document and process
    them as described in the _normalize_document(doc) function: header lines
    that carry job-specific values (job name, comments, etc.) are dropped.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines the part of the document
        that is already processed; searching for headers starts from this
        position
    @param out: already processed part of the document (from the beginning to
        the given position)

    @returns new position and output; the position is set at the end of the
        last processed PJL header or it is a copy of the input position, if
        no PJL headers have been found; the output is adjusted accordingly.

    """
    # <ESC>%-12345X - Universal Exit Language marker preceding PJL headers.
    # NOTE: plain str literal (on Python 2, b'...' and '...' are the same
    # type, so this is byte-identical to the previous b'' form).
    PJL_MARKER = '\x1B%-12345X'
    MARGIN = 2048  # max distance to the header
    position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    while position_pjl >= 0:
        out += doc[position:(position_pjl + len(PJL_MARKER))]
        position = position_pjl + len(PJL_MARKER)
        # parse the header and filter out lines with variable content
        lines, position = _read_lines_with_prefix(doc, position, '@PJL')
        for line in lines:
            if not (line.startswith('@PJL SET ') or
                    line.startswith('@PJL COMMENT') or
                    line.startswith('@PJL DMINFO') or
                    line.startswith('@PJL JOB NAME') or
                    line.startswith('@PJL JOBNAME')):
                out += line
        # try to find the next PJL header
        position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    return position, out


def _process_PS_Adobe_headers(doc, position, out):
    """
    The function tries to find PS-Adobe headers in the given document and
    process them as described in the _normalize_document(doc) function:
    lines carrying job-specific values (title, user, job info, time stamps)
    are dropped.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines the part of the document
        that is already processed; searching for headers starts from this
        position
    @param out: already processed part of the document (from the beginning to
        the given position)

    @returns new position and output; the position is set at the end of the
        last processed PS-Adobe header or it is a copy of the input position,
        if no PS-Adobe headers have been found; the output is adjusted
        accordingly.

    """
    PS_MARKER = '%!PS-Adobe'
    MARGIN = 2048  # max distance to the header
    position_ps = doc.find(PS_MARKER, position, position + MARGIN)
    while position_ps >= 0:
        # add everything till the end of the first line in the header
        position_next_line = doc.find('\n', position_ps + len(PS_MARKER))
        if position_next_line < 0:
            break  # no more '\n', we finish the parsing here
        position_next_line += 1  # to eat the '\n' character
        out += doc[position:position_next_line]
        # parse the rest of the header and filter problematic lines
        lines, position = _read_lines_with_prefix(doc, position_next_line, '%')
        for line in lines:
            if not (line.startswith('%%Title:') or line.startswith('%%For:')):
                out += line
        # search for lines with '{setuserinfo}' or '/JobInfo <<'
        position_ps = doc.find(PS_MARKER, position, position + MARGIN)
        position_ui = doc.find('{setuserinfo}', position, position + MARGIN)
        position_ji = doc.find('/JobInfo <<', position, position + MARGIN)
        # if '/JobInfo <<' was found, move the offset to the end of the section
        if position_ji >= 0:
            position_ji = doc.find('>>', position_ji)
        # if the beginning of the next header was found, make sure that
        # detected sections do not belong to the next header
        if position_ps >= 0:
            if position_ji > position_ps:
                position_ji = -1
            if position_ui > position_ps:
                position_ui = -1
        # choose the farthest section
        position_end = max(position_ji, position_ui)
        if position_end >= 0:
            # find the first '\n' after the farthest section
            position_end = doc.find('\n', position_end)
            if position_end < 0:
                break  # no more '\n', we finish the parsing here
            # split into lines everything from here to the end of the section
            lines = doc[position:position_end].split('\n')
            position = position_end + 1  # +1 is needed to eat the last '\n'
            # filter lines with variable, job-specific content
            for line in lines:
                if not (line.find('{setuserinfo}') >= 0 or
                        line.find('/UserID') >= 0 or
                        line.find('/Time') >= 0 or
                        line.find('/HostLoginName') >= 0 or
                        line.find('/HostName') >= 0):
                    out += line + '\n'
        # go to the next iteration, position_ps is already set
    return position, out


def _normalize_LIDIL(doc):
    """
    The function tries to process the given document as described in the
    _normalize_document(doc) function, but assuming that the document is in
    the LIDIL format. This format is used by some HP printers.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in the LIDIL format.
        Otherwise, it returns a result as for the _normalize_document(doc)
        function.

    """
    # NOTE: plain str literal (byte-identical to b'...' on Python 2)
    LIDIL_MARKER = '\x24\x01\x00\x00\x07\x00\x00\x00'
    LIDIL_JOBID_1_OFF = 2348  # first job id, offset from the beginning
    LIDIL_JOBID_2_OFF = 2339  # second job id, offset from the end
    JOBID_SIZE = 4  # number of bytes used to store a job id
    # the document is in the LIDIL format <=> it starts with the marker
    if not doc.startswith(LIDIL_MARKER):
        return None
    # remove both JOB IDs and exit
    nd = len(doc)
    if nd > LIDIL_JOBID_1_OFF + LIDIL_JOBID_2_OFF + 2 * JOBID_SIZE:
        doc = ''.join([
                doc[:LIDIL_JOBID_1_OFF],
                doc[(LIDIL_JOBID_1_OFF + JOBID_SIZE):(nd - LIDIL_JOBID_2_OFF)],
                doc[(nd - LIDIL_JOBID_2_OFF + JOBID_SIZE):]])
    return doc


def _normalize_EJL(doc):
    """
    The function tries to process the given document as described in the
    _normalize_document(doc) function, but assuming that the document is in
    the EJL format.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in the EJL format. Otherwise,
        it returns a result as for the _normalize_document(doc) function.

    """
    # EJL - some Epson printers (like eplaser)
    # NOTE: plain str literal (byte-identical to b'...' on Python 2)
    EJL_MARKER = '\x1B\x01@EJL \n'
    # the document is in the EJL format <=> it starts with the marker
    if not doc.startswith(EJL_MARKER):
        return None
    # copy the document to output; filter lines parsed from the EJL header
    out = EJL_MARKER
    lines, position = _read_lines_with_prefix(doc, len(EJL_MARKER), '@EJL')
    for line in lines:
        # drop lines holding the job id and the user name
        if not (line.startswith('@EJL JI ID=') or
                line.startswith('@EJL JI USER=')):
            out += line
    # add the rest of the document and exit
    out += doc[position:]
    return out


def _normalize_document(doc):
    """
    The input document is a raw package sent to a printer. This function
    removes from it all variables that can change when the same content is
    printed. That includes, but is not limited to: user name, host name,
    job id, date, time.

    @param doc: a raw document sent directly to a printer to be printed

    @returns a copy of doc with removed fragments that can vary between
        printing jobs. The returned output is supposed to be identical for
        the same input content sent to the pipeline for the same PPD file.

    """
    # Try to parse the document as LIDIL or EJL and exit if successful.
    out = _normalize_LIDIL(doc)
    if out is not None:
        return out
    out = _normalize_EJL(doc)
    if out is not None:
        return out

    # Try to parse and process PJL and PS headers.
    position = 0
    out = ''
    position, out = _process_PJL_headers(doc, position, out)
    position, out = _process_PS_Adobe_headers(doc, position, out)

    # Go to the tail of the document, add the skipped content to the output.
    if position + 2048 < len(doc):
        position_tail = len(doc) - 2048
        out += doc[position:position_tail]
        position = position_tail

    # Try to find 'trailer << '.
    position_trailer = doc.find('trailer << ', position)
    if position_trailer >= 0:
        # If found, prune the line with it.
        position_end = doc.find('\n', position_trailer)
        if position_end >= 0:
            out += doc[position:position_trailer]
            position = position_end + 1  # +1 to omit '\n' from the trailer

    # Add the rest of the document to the output.
    out += doc[position:]

    return out


def calculate_digest(doc):
    """
    Calculates a digest for the given document.

    @param doc: document's content

    @returns calculated digest as a string of hexadecimals

    """
    # Prune the variable parts of the document
    out = _normalize_document(doc)

    # Calculate the hash; hashlib replaces the deprecated md5 module and
    # produces identical digests. hashlib requires bytes on Python 3; on
    # Python 2 str already is bytes, so the branch below is skipped there.
    if not isinstance(out, bytes):
        out = out.encode('latin-1')  # 1:1 mapping of code points 0-255
    return hashlib.md5(out).hexdigest()


def parse_digests_file(path_digests, denylist):
    """
    Parses digests and output sizes from a file.

    @param path_digests: a path to a file with digests
    @param denylist: list of keys to omit

    @returns two dictionaries, both indexed by ppd filenames: the first one
        contains digests, the second one contains output sizes; returns
        empty dictionaries if the given file does not exist

    """
    digests = dict()
    sizes = dict()
    denylist = set(denylist)
    if os.path.isfile(path_digests):
        # text mode: the file holds tab-separated text lines
        with open(path_digests, 'r') as file_digests:
            lines = file_digests.read().splitlines()
            for line in lines:
                cols = line.split()
                if len(cols) >= 2 and cols[0] not in denylist:
                    digests[cols[0]] = cols[1]
                    # the third column (output size) is optional
                    if len(cols) > 2 and len(cols[2]) > 0:
                        sizes[cols[0]] = int(cols[2])
    return digests, sizes


def save_digests_file(path_digests, digests, sizes, denylist):
    """
    Saves a list of digests and output sizes to a file.

    @param path_digests: a path to the output file
    @param digests: dictionary with digests (keys are names)
    @param sizes: dictionary with output sizes (keys are names)
    @param denylist: list of keys to ignore

    @returns None; the digests are written to the file at path_digests

    """
    digests_content = ''
    names = sorted(set(digests.keys()).difference(denylist))
    for name in names:
        digest = digests[name]
        # the file format is tab-separated; fields must not contain '\t'/'\n'
        assert name.find('\t') < 0 and name.find('\n') < 0
        assert digest.find('\t') < 0 and digest.find('\n') < 0
        digests_content += name + '\t' + digest
        if name in sizes:
            assert isinstance(sizes[name], int)
            digests_content += '\t' + str(sizes[name])
        digests_content += '\n'

    # text mode: digests_content is a text string
    with open(path_digests, 'w') as file_digests:
        file_digests.write(digests_content)


def load_lines_from_file(path):
    """
    Loads strings stored in the given file as separate lines.

    This routine returns lines read from the given file. All leading and
    trailing whitespace characters in each line are removed. Lines
    consisting of whitespace characters only are skipped.

    @param path: a path to the input file

    @returns a list of non-empty strings

    """
    with open(path) as input_file:
        stripped = (raw_line.strip() for raw_line in input_file)
        return [line for line in stripped if line != '']


# ===================== PPD files on the SCS server

def get_filenames_from_PPD_index(task_id):
    """
    It downloads an index file from the SCS server and extracts names
    of PPD files from it.

    @param task_id: an order number of an index file to process; this is
        an integer from the interval [0..20)

    @returns a list of PPD filenames (may contain duplicates)

    """
    # imported locally so that the rest of the module can be used
    # without the third-party requests package installed
    import requests
    # calculates a URL of the index file
    url_metadata = 'https://www.gstatic.com/chromeos_printing/metadata_v2/'
    url_ppd_index = url_metadata + ('index-%02d.json' % task_id)
    # downloads and parses the index file
    request = requests.get(url_ppd_index)
    entries = json.loads(request.content)
    # extracts and returns PPD filenames
    # (the second element in each index entry)
    return [entry[1] for entry in entries]


def download_PPD_file(ppd_file):
    """
    It downloads a PPD file from the SCS server.

    @param ppd_file: a filename of PPD file (neither path nor URL)

    @returns content of the PPD file

    """
    # imported locally so that the rest of the module can be used
    # without the third-party requests package installed
    import requests
    url_ppds = 'https://www.gstatic.com/chromeos_printing/ppds/'
    request = requests.get(url_ppds + ppd_file)
    return request.content


# ==================== Local filesystem

def list_entries_from_directory(
        path,
        with_suffixes=None, nonempty_results=False,
        include_files=True, include_directories=True):
    """
    It returns all filenames from the given directory. Results may be
    filtered by filename suffixes or entry types.

    @param path: a path to the directory to list entries from
    @param with_suffixes: if set, only entries with given suffixes are
        returned; it must be a tuple
    @param nonempty_results: if True, an Exception is raised if there are
        no results
    @param include_files: if False, regular files and links are omitted
    @param include_directories: if False, directories are omitted

    @returns a list of entries meeting the given criteria

    @raises Exception if no matching entries were found and
        nonempty_results is set to True

    """
    # lists all entries from the directory and filters them by the criteria
    list_of_files = []
    for filename in os.listdir(path):
        path_entry = os.path.join(path, filename)
        # check the entry type
        if os.path.isfile(path_entry):
            if not include_files:
                continue
        elif os.path.isdir(path_entry):
            if not include_directories:
                continue
        else:
            # neither a regular file/link nor a directory - always skipped
            continue
        # check the suffix
        if with_suffixes is not None:
            if not filename.endswith(with_suffixes):
                continue
        list_of_files.append(filename)
    # raises an exception if no entries were found and they were required
    if nonempty_results and not list_of_files:
        message = 'Directory %s does not contain any ' % path
        message += 'entries meeting the criteria'
        raise Exception(message)
    return list_of_files