# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import md5
import os
import requests

# ==================== Document digests

def _read_lines_with_prefix(document, position, prefix):
    """
    Starting from the given position, it parses from the document complete
    lines (with the '\n' character at the end) starting with the given prefix.
    The parser stops on the first line that does not start with the given
    prefix or when there are no more '\n' characters in the document.

    @param document: a document to parse
    @param position: an offset in the document to start from
    @param prefix: a prefix that every parsed line must start with

    @returns a pair (lines, position), where the first element is a list of
        parsed lines (with the '\n' character at the end) and the second
        element is a new offset in the document, pointing at the first
        character after the last parsed line

    """
    lines = []
    while document.startswith(prefix, position):
        position_next_line = document.find('\n', position + len(prefix))
        if position_next_line < 0:
            break
        position_next_line += 1  # to eat '\n' character
        lines.append(document[position:position_next_line])
        position = position_next_line
    return lines, position
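
# Example (a sketch with a hypothetical input): for
#   doc = '@PJL SET A\n@PJL SET B\nrest of the document'
# _read_lines_with_prefix(doc, 0, '@PJL') returns
# (['@PJL SET A\n', '@PJL SET B\n'], 22), i.e. both prefixed lines and the
# offset of the first character after them.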


def _process_PJL_headers(doc, position, out):
    """
    The function tries to find PJL headers in the given document and processes
    them as described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines part of the document that
            is already processed; searching for headers starts from this
            position
    @param out: already processed part of the document (from the beginning to
            the given position)

    @returns new position and output; the position is set at the end of the
            last processed PJL header or is a copy of the input position if no
            PJL headers have been found; the output is adjusted accordingly.

    """
    PJL_MARKER = b'\x1B%-12345X'
    MARGIN = 2048  # max distance to the header
    position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    while position_pjl >= 0:
        out += doc[position:(position_pjl+len(PJL_MARKER))]
        position = position_pjl + len(PJL_MARKER)
        # parse header and filter problematic lines
        lines, position = _read_lines_with_prefix(doc, position, '@PJL')
        for line in lines:
            if not (line.startswith('@PJL SET ') or
                    line.startswith('@PJL COMMENT') or
                    line.startswith('@PJL DMINFO') or
                    line.startswith('@PJL JOB NAME') or
                    line.startswith('@PJL JOBNAME')):
                out += line
        # try to find next PJL header
        position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    return position, out
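
# For illustration (hypothetical header): a document starting with
#   '\x1B%-12345X@PJL JOB NAME="doc42"\n@PJL SET COPIES=1\n'
#   '@PJL ENTER LANGUAGE=PCL\n...'
# keeps the marker and the '@PJL ENTER LANGUAGE=PCL' line in the output, while
# the '@PJL JOB NAME' and '@PJL SET ' lines are filtered out.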


def _process_PS_Adobe_headers(doc, position, out):
    """
    The function tries to find PS-Adobe headers in the given document and
    processes them as described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: offset in the document; defines part of the document that
            is already processed; searching for headers starts from this
            position
    @param out: already processed part of the document (from the beginning to
            the given position)

    @returns new position and output; the position is set at the end of the
            last processed PS-Adobe header or is a copy of the input position
            if no PS-Adobe headers have been found; the output is adjusted
            accordingly.

    """
    PS_MARKER = '%!PS-Adobe'
    MARGIN = 2048  # max distance to the header
    position_ps = doc.find(PS_MARKER, position, position + MARGIN)
    while position_ps >= 0:
        # add everything till the end of the first line in the header
        position_next_line = doc.find('\n', position_ps + len(PS_MARKER))
        if position_next_line < 0:
            break  # no more '\n', we finish the parsing here
        position_next_line += 1  # to eat '\n' character
        out += doc[position:position_next_line]
        # parse the rest of the header and filter problematic lines
        lines, position = _read_lines_with_prefix(doc, position_next_line, '%')
        for line in lines:
            if not (line.startswith('%%Title:') or line.startswith('%%For:')):
                out += line
        # search for lines with '{setuserinfo}' or '/JobInfo <<'
        position_ps = doc.find(PS_MARKER, position, position + MARGIN)
        position_ui = doc.find('{setuserinfo}', position, position + MARGIN)
        position_ji = doc.find('/JobInfo <<', position, position + MARGIN)
        # if '/JobInfo <<' was found, move the offset to the end of the section
        if position_ji >= 0:
            position_ji = doc.find('>>', position_ji)
        # if the beginning of the next header was found, make sure that
        # detected sections do not belong to the next header
        if position_ps >= 0:
            if position_ji > position_ps:
                position_ji = -1
            if position_ui > position_ps:
                position_ui = -1
        # choose the farthest section
        position_end = max(position_ji, position_ui)
        if position_end >= 0:
            # find the first '\n' after the farthest section
            position_end = doc.find('\n', position_end)
            if position_end < 0:
                break  # no more '\n', we finish the parsing here
            # split into lines everything from here to the end of the section
            lines = doc[position:position_end].split('\n')
            position = position_end + 1  # +1 is needed to eat the last '\n'
            # filter problematic lines
            for line in lines:
                if not (line.find('{setuserinfo}') >= 0 or
                        line.find('/UserID') >= 0 or
                        line.find('/Time') >= 0 or
                        line.find('/HostLoginName') >= 0 or
                        line.find('/HostName') >= 0):
                    out += line + '\n'
            # go to the next iteration, position_ps is already set
    return position, out
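
# For illustration (hypothetical header): in a document containing
#   '%!PS-Adobe-3.0\n%%Title: doc42\n%%Creator: cups\n...'
# the '%%Title:' (and '%%For:') comment lines are filtered out; in addition,
# lines containing '{setuserinfo}', '/UserID', '/Time', '/HostLoginName' or
# '/HostName' up to the end of a following '/JobInfo << ... >>' section are
# dropped, since they differ between print jobs.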


def _normalize_LIDIL(doc):
    """
    The function tries to process the given document as described in the
    _normalize_document(doc) function, but assuming that the document is in
    LIDIL format. This format is used by some HP printers.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in LIDIL format. Otherwise, it
        returns a result as described in the _normalize_document(doc) function.

    """
    LIDIL_MARKER = b'\x24\x01\x00\x00\x07\x00\x00\x00'
    LIDIL_JOBID_1_OFF = 2348 # first job id, offset from the beginning
    LIDIL_JOBID_2_OFF = 2339 # second job id, offset from the end
    JOBID_SIZE = 4 # number of bytes used to store job id
    # the document is in LIDIL format <=> it starts with the marker
    if not doc.startswith(LIDIL_MARKER):
        return None
    # remove both JOB IDs and exit
    nd = len(doc)
    if nd > LIDIL_JOBID_1_OFF + LIDIL_JOBID_2_OFF + 2*JOBID_SIZE:
        doc = ''.join([ doc[:(LIDIL_JOBID_1_OFF)],
                doc[(LIDIL_JOBID_1_OFF+JOBID_SIZE):(nd-LIDIL_JOBID_2_OFF)],
                doc[(nd-LIDIL_JOBID_2_OFF+JOBID_SIZE):] ])
    return doc


def _normalize_EJL(doc):
    """
    The function tries to process the given document as described in the
    _normalize_document(doc) function, but assuming that the document is in
    EJL format.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in EJL format. Otherwise, it
        returns a result as described in the _normalize_document(doc) function.

    """
    # EJL - some Epson printers (like eplaser)
    EJL_MARKER = b'\x1B\x01@EJL \n'
    # the document is in EJL format <=> it starts with the marker
    if not doc.startswith(EJL_MARKER):
        return None
    # copy the document to output; filter lines parsed from the EJL header
    out = EJL_MARKER
    lines, position = _read_lines_with_prefix(doc, len(EJL_MARKER), '@EJL')
    for line in lines:
        if not (line.startswith('@EJL JI ID=') or
                line.startswith('@EJL JI USER=')):
            out += line
    # add the rest of the document and exit
    out += doc[position:]
    return out
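
# For illustration (hypothetical header): a document starting with
#   '\x1B\x01@EJL \n@EJL JI ID="42"\n@EJL JI USER="alice"\n@EJL EN LA=ESC/PAGE\n'
# keeps the marker and the '@EJL EN LA=...' line, while the '@EJL JI ID=' and
# '@EJL JI USER=' lines are removed.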


def _normalize_document(doc):
    """
    The input document is a raw package sent to the printer. This function
    removes from it all variables that can change when the same content is
    printed. That includes, but is not limited to: user name, host name,
    job id, date, time.

    @param doc: a raw document sent directly to the printer to be printed

    @returns a copy of doc with removed fragments that can vary between
        printing jobs. The returned output is supposed to be identical for the
        same input content sent to the pipeline for the same PPD file.

    """
    # Try to parse the document as LIDIL or EJL and exit if successful.
    out = _normalize_LIDIL(doc)
    if out is not None:
        return out
    out = _normalize_EJL(doc)
    if out is not None:
        return out

    # Try to parse and process PJL and PS headers.
    position = 0
    out = ''
    position, out = _process_PJL_headers(doc, position, out)
    position, out = _process_PS_Adobe_headers(doc, position, out)

    # Go to the tail of the document, add the skipped content to the output.
    if position + 2048 < len(doc):
        position_tail = len(doc) - 2048
        out += doc[position:position_tail]
        position = position_tail

    # Try to find 'trailer << '.
    position_trailer = doc.find('trailer << ', position)
    if position_trailer >= 0:
        # If found, prune the line with it.
        position_end = doc.find('\n', position_trailer)
        if position_end >= 0:
            out += doc[position:position_trailer]
            position = position_end + 1  # +1 to omit '\n' from the trailer

    # Add the rest of the document to the output.
    out += doc[position:]

    return out


def calculate_digest(doc):
    """
    Calculates a digest for the given document.

    @param doc: document's content

    @returns the calculated digest as a string of hexadecimal digits

    """
    # Prune the variable parts of the document
    out = _normalize_document(doc)

    # Calculate the hash
    return md5.new(out).hexdigest()
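
# Example use (a sketch; 'doc.prn' is a hypothetical file with a captured
# print job):
#
#   with open('doc.prn', 'rb') as input_file:
#       digest = calculate_digest(input_file.read())
#
# Two jobs produced from the same content and the same PPD file should yield
# the same digest, because _normalize_document() strips job ids, user names,
# dates and similar varying fragments first.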


def parse_digests_file(path_digests, denylist):
    """
    Parses digests and output sizes from the given file.

    @param path_digests: a path to a file with digests
    @param denylist: list of keys to omit

    @returns two dictionaries, both indexed by ppd filenames: the first one
            contains digests, the second one contains output sizes; returns
            empty dictionaries if the given file does not exist

    """
    digests = dict()
    sizes = dict()
    denylist = set(denylist)
    if os.path.isfile(path_digests):
        with open(path_digests, 'rb') as file_digests:
            lines = file_digests.read().splitlines()
            for line in lines:
                cols = line.split()
                if len(cols) >= 2 and cols[0] not in denylist:
                    digests[cols[0]] = cols[1]
                    if len(cols) > 2 and len(cols[2]) > 0:
                        sizes[cols[0]] = int(cols[2])
    return digests, sizes
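
# The digests file parsed above is assumed to be a plain text file with one
# record per line: a PPD filename, its digest and an optional output size,
# separated by whitespace (save_digests_file() below writes the same fields
# tab-separated).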


def save_digests_file(path_digests, digests, sizes, denylist):
    """
    Saves the list of digests and output sizes to a file.

    @param path_digests: a path to the output file
    @param digests: dictionary with digests (keys are names)
    @param sizes: dictionary with output sizes (keys are names)
    @param denylist: list of keys to ignore

    """
    digests_content = ''
    names = sorted(set(digests.keys()).difference(denylist))
    for name in names:
        digest = digests[name]
        assert name.find('\t') < 0 and name.find('\n') < 0
        assert digest.find('\t') < 0 and digest.find('\n') < 0
        digests_content += name + '\t' + digest
        if name in sizes:
            assert isinstance(sizes[name], int)
            digests_content += '\t' + str(sizes[name])
        digests_content += '\n'

    with open(path_digests, 'wb') as file_digests:
        file_digests.write(digests_content)
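
# Example round trip (a sketch; the path, the PPD name and 'doc' are
# hypothetical):
#
#   digests, sizes = parse_digests_file('digests.txt', denylist=[])
#   digests['printer_a.ppd.gz'] = calculate_digest(doc)
#   save_digests_file('digests.txt', digests, sizes, denylist=[])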


def load_lines_from_file(path):
    """
    Loads strings stored in the given file as separate lines.

    This routine returns lines read from the given file. All leading and
    trailing whitespace characters in each line are removed. Lines consisting
    of whitespace characters only are skipped.

    @param path: a path to the input file

    @returns a list of non-empty strings

    """
    with open(path) as input_file:
        lines = input_file.readlines()

    output_list = []
    for entry in lines:
        entry = entry.strip()
        if entry != '':
            output_list.append(entry)

    return output_list


# ===================== PPD files on the SCS server

def get_filenames_from_PPD_index(task_id):
    """
    It downloads an index file from the SCS server and extracts names
    of PPD files from it.

    @param task_id: an ordinal number of the index file to process; this is
            an integer from the interval [0..20)

    @returns a list of PPD filenames (may contain duplicates)

    """
    # calculates a URL of the index file
    url_metadata = 'https://www.gstatic.com/chromeos_printing/metadata_v2/'
    url_ppd_index = url_metadata + ('index-%02d.json' % task_id)
    # downloads and parses the index file
    request = requests.get(url_ppd_index)
    entries = json.loads(request.content)
    # extracts PPD filenames (the second element in each index entry)
    output = []
    for entry in entries:
        output.append(entry[1])
    # returns a list of extracted filenames
    return output
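
# Note: only the structure actually used above is assumed here: each entry of
# the downloaded index is a JSON array whose second element (entry[1]) is a
# PPD filename.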


def download_PPD_file(ppd_file):
    """
    It downloads a PPD file from the SCS server.

    @param ppd_file: a filename of PPD file (neither path nor URL)

    @returns content of the PPD file

    """
    url_ppds = 'https://www.gstatic.com/chromeos_printing/ppds/'
    request = requests.get(url_ppds + ppd_file)
    return request.content


# ==================== Local filesystem

def list_entries_from_directory(
        path,
        with_suffixes=None, nonempty_results=False,
        include_files=True, include_directories=True):
    """
    It returns all filenames from the given directory. Results may be filtered
    by filename suffixes or entry types.

    @param path: a path to the directory to list entries from
    @param with_suffixes: if set, only entries with given suffixes are
            returned; it must be a tuple
    @param nonempty_results: if True, an Exception is raised when there are
            no results
    @param include_files: if False, then regular files and links are omitted
    @param include_directories: if False, directories are omitted

    @returns a non-empty list of entries meeting the given criteria

    @raises Exception if no matching filenames were found and
            nonempty_results is set to True

    """
    # lists all files from the directory and filters them by given criteria
    list_of_files = []
    for filename in os.listdir(path):
        path_entry = os.path.join(path, filename)
        # check type
        if os.path.isfile(path_entry):
            if not include_files:
                continue
        elif os.path.isdir(path_entry):
            if not include_directories:
                continue
        else:
            continue
        # check suffix
        if with_suffixes is not None:
            if not filename.endswith(with_suffixes):
                continue
        list_of_files.append(filename)
    # throws an exception if no files were found
    if nonempty_results and len(list_of_files) == 0:
        message = 'Directory %s does not contain any ' % path
        message += 'entries meeting the criteria'
        raise Exception(message)
    # returns a non-empty list
    return list_of_files
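
# Example use (a sketch; the directory path and suffixes are hypothetical):
#
#   ppd_files = list_entries_from_directory(
#           '/tmp/ppds', with_suffixes=('.ppd', '.ppd.gz'),
#           nonempty_results=True, include_directories=False)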