import logging, json, os, re, urllib3
import numpy as np
from bs4 import BeautifulSoup
from collections import namedtuple
from pathlib import Path
from PIL import Image, ImageDraw
from subprocess import run

from .constants import EMOJI_NAMES
from . import utils

# Memento aggregator used by _downloadTimemaps to fetch CDXJ timemaps.
MEMGATOR_HOST = 'https://memgator.cs.odu.edu'

LOG_FORMAT = logging.Formatter('%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
# Node-based crawler entry point, resolved relative to the project root.
CRAWL_SCRIPT = Path(utils.rootDir(), 'crawler', 'crawler.js')
# Per-category element data emitted by the crawler (one JSON object per line).
CRAWL_DATA_FILES = [
    'css.jsonl',
    'js.jsonl',
    'iframe.jsonl',
    'text.jsonl',
    'image.jsonl',
    'video.jsonl',
]
# Network traffic captured during the crawl (one JSON object per line).
NETWORK_FILES = [
    'requests.jsonl',
    'requests_failed.jsonl',
    'requests_pending.jsonl',
    'responses.jsonl',
    'redirects.jsonl',
]

# Axis-aligned bounding box: (xmin, ymin, xmax, ymax).
Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax')

VIEWPORT_SIZE = (1920, 1080)

# Augmented image sizes: (width, height) pairs guessed for missing media
# elements (see _guessImageSize / _calculateVideoMetrics).
S_EMOJI = (32, 32)
S_AVATAR = (50, 50)
S_THUMBNAIL = (150, 150)
S_IMAGE_SM = (300, 200)
S_IMAGE_LG = (500, 400)
S_VIDEO_SM = (854, 480)
S_VIDEO_LG = (1280, 720)

# Category weights: relative contribution of each resource category to damage.
W_CSS = 0.1
W_JS = 0.1
W_TEXT = 0.1
W_IMAGE = 0.3
W_VIDEO = 0.4

# WORD_IMG_RATIO = 1000

# Resource URL prefixes excluded from analysis (archive infrastructure, not
# part of the archived page itself).
URI_BLACKLIST = [
    'https://analytics.archive.org/',
    'http://analytics.archive.org/',
    'https://web.archive.org/static',
    'http://web.archive.org/static',
    # '[INTERNAL]',
]


class DamageAnalysis:

    def __init__(self, cacheDir, uri, warcDir=None, warcFile=None, options=None):
        """Set up a damage analysis session for one URI.

        Args:
            cacheDir: directory for cached crawl/analysis artifacts (created
                if missing).
            uri: the (possibly archive-replay) URI under analysis.
            warcDir: optional directory holding the WARC source for the crawler.
            warcFile: optional WARC file name within warcDir.
            options: 5-tuple (debug, ignoreCache, logLevel, timeout, viewport);
                defaults to (False, False, logging.WARN, 30, (1920, 1080)).

        Side effects: creates cacheDir, attaches a per-URI file handler that
        writes to <cacheDir>/analysis.log, loads any matching site template,
        and (unless ignoreCache) loads previously cached analysis data.
        """
        # Positional unpack of the options tuple; order must match the default.
        self.debug, \
        self.ignoreCache, \
        self.logLevel, \
        self.timeout, \
        self.viewport = options if options else (False, False, logging.WARN, 30, (1920, 1080))

        self._log = None  # per-URI logger, configured below

        self.uri: str = uri
        self.warcDir = warcDir
        self.warcFile = warcFile
        self.cacheDir: str = cacheDir

        self.error = None   # last fatal error, if any
        self.result = None  # loaded or computed analysis result

        self._crawlData = None   # per-category element data from the crawler
        self._pageData = None    # page size, ink heatmaps, content area
        self._precursors = None  # per-element damage precursor metrics
        # self._redirectMap = None
        # self._urirRedirects = None

        # Per-category element buckets that may contribute to damage scores.
        self.potentialDamage = {
            'video': [],
            'iframe': [],
            'image': [],
            'text': [],
            'css': [],
            'js': [],
        }
        self.actualDamage = {
            'video': [],
            'iframe': [],
            'image': [],
            'text': [],
            'css': [],
            'js': [],
        }

        self._template = None   # matching site template (see _loadTemplate)
        self._annotations = {}  # bboxes to draw onto the annotated screenshot

        # Initialize cache and logging
        if not Path(self.cacheDir).is_dir():
            if self.debug: print(f'Creating cache subdirectory: {self.cacheDir}')
            utils.mkDir(self.cacheDir)

        # Append to an existing log unless the cache is being ignored (fresh run).
        logMode = 'a' if Path(self.cacheDir, 'analysis.log').is_file() and not self.ignoreCache else 'w'
        fileHandler = logging.FileHandler(Path(self.cacheDir, 'analysis.log'), mode=logMode)
        fileHandler.setFormatter(LOG_FORMAT)
        self._log = logging.getLogger(self.uri)
        self._log.addHandler(fileHandler)
        self._log.setLevel(self.logLevel)

        self._loadTemplate()

        if not self.ignoreCache:
            self._loadCachedData()


    def _closeLog(self) -> None:
        for h in self.logger.handlers:
            h.flush()
            h.close()
            self.logger.removeHandler(h)


    def _loadCachedData(self) -> None:
        self._log.info('Loading cached analysis data')

        # Network files
        try:
            self._netData = {}
            for netFile in NETWORK_FILES:
                if not Path(self.cacheDir, 'net', netFile).is_file(): continue
                with open(Path(self.cacheDir, 'net', netFile), mode='r', encoding='utf-8') as f:
                    self._netData[netFile[:-len('.jsonl')]] = [json.loads(line) for line in list(f)]
        except Exception as e:
            print(e)
            self._netData = None

        # Data files
        try:
            self._crawlData = {}
            for dataFile in CRAWL_DATA_FILES:
                if not Path(self.cacheDir, 'data', dataFile).is_file(): continue
                with open(Path(self.cacheDir, 'data', dataFile), mode='r', encoding='utf-8') as f:
                    self._crawlData[dataFile[:-len('.jsonl')]] = [json.loads(line) for line in list(f)]
        except Exception as e:
            print(e)
            self._crawlData = None
            self._pageData = None

        # Existing analysis data
        try:
            if (Path(self.cacheDir, 'results.json').is_file()):
                self._log.info('Loading result data')
                with open(Path(self.cacheDir, 'data', 'precursors.jsonl'), mode='r', encoding='utf-8') as f:
                    self._precursors = json.loads(f.read())
                with open(Path(self.cacheDir, 'results.json'), mode='r', encoding='utf-8') as f:
                    self.result = json.loads(f.read())
            else:
                self._precursors = None
                self.result = None
        except:
            self._log.error(f'Unable to load result data')
            self._precursors = None
            self.result = None


    def _downloadTimemaps(self) -> None:
        """Fetch a CDXJ timemap for the analysis URI from MemGator and cache it.

        No-op when <cacheDir>/page/timemap.cdxj already exists. Failures are
        logged, never raised.
        """
        if Path(self.cacheDir, 'page', 'timemap.cdxj').is_file(): return

        # Strip any archive-replay prefix: keep from the last 'http' onward.
        uriR = self.uri[self.uri.rfind('http'):]
        memgatorURI = f'{MEMGATOR_HOST}/timemap/cdxj/{uriR}'
        self._log.info(f'Grabbing CDX timemap: {memgatorURI}')

        http = urllib3.PoolManager()
        res = http.request('GET', memgatorURI)
        if res.status == 200:
            try:
                resBody = res.data.decode("utf-8")
                # Write explicitly as UTF-8 so the platform default encoding
                # cannot corrupt the cached timemap.
                with open(Path(self.cacheDir, 'page', 'timemap.cdxj'), 'w', encoding='utf-8') as f:
                    f.write(resBody)
            except (OSError, UnicodeDecodeError):
                # Narrowed from a bare except; log the traceback for diagnosis.
                self._log.error('Unable to save timemap data', exc_info=True)
        else:
            self._log.error('Unable to retrieve timemap')


    def _initiatePageCrawl(self) -> None:
        """Run the Node crawler for this URI unless cached crawl data exists.

        Builds the crawler.js command line from the analysis options and, on
        success, reloads the cached data the crawl produced. Failures are
        logged and leave existing state untouched.
        """
        if self._crawlData and not self.ignoreCache:
            self._log.info('Existing crawl data, skipping...')
            return

        # node crawler.js [warcFile] [cache] [debug] [log_level] [timeout] [viewport] [uri]
        crawlCommand = ['node', str(CRAWL_SCRIPT)]
        if self.debug: crawlCommand.append('-d')
        crawlCommand.append(f'--cache={self.cacheDir}')
        crawlCommand.append(f'--log={self.logLevel}')
        crawlCommand.append(f'--timeout={self.timeout}')
        crawlCommand.append(f'--viewport={self.viewport[0]}_{self.viewport[1]}')
        if self.warcFile and self.warcDir:
            crawlCommand.append(f'--warcDir={self.warcDir}')
            crawlCommand.append(f'--warcFile={self.warcFile}')
        crawlCommand.append(self.uri)

        crawlProcess = run(crawlCommand)

        # A signal-killed process yields a NEGATIVE returncode; '!= 0' treats
        # that as failure too (the previous '> 0' treated it as success).
        if crawlProcess.returncode != 0:
            self._log.error('Crawl unsuccessful (refer to crawl.log), skipping analysis...')
            return

        self._log.info('Crawl successful')
        self._loadCachedData()


    def _generateAnnotatedScreenshot(self, generate_highlights=False) -> None:
        if not Path(self.cacheDir, 'screenshots', 'screenshot.png').is_file():
            if self.debug:
                print('Unable to generate annotated screenshot, no base screenshot available...')
                self._log.error('Unable to generate annotated screenshot, no base screenshot available.')
            return

        if len(self._annotations) > 0:
            screenshot = Image.open(Path(self.cacheDir, 'screenshots', 'screenshot.png'))
            annotatedScreenshot = ImageDraw.Draw(screenshot)
            for group in self._annotations:
                for i in self._annotations[group]:
                    for r, bbox in self._annotations[group][i].items():
                        annotatedScreenshot.rectangle(
                            xy=bbox,
                            outline='red',
                            width=2
                        )

                        textPosition = (bbox[0]+4, bbox[1], bbox[2], bbox[3])
                        annotatedScreenshot.text(textPosition, f"{self._precursors[group][i]['value']*100:.2f}%", font_size=15,
                                                fill='white', stroke_width=1, stroke_fill='black')

                # if generate_highlights:
                #     width, height = screenshot.size
                #     buffer = 24
                #     x = r['left'] - buffer
                #     if x < 0: x = 0
                #     y = r['top'] - buffer
                #     if y < 0: y = 0
                #     w = r['width'] + buffer
                #     if r['left'] + w > width: w = width
                #     h = r['height'] + buffer
                #     if r['top'] + h > height: h = height
                #     crop = screenshot.crop((x, y, w, h))
                #     cropAnnotation = ImageDraw.Draw(crop)
                #     cropAnnotation.rectangle((buffer, buffer, r['width'], r['height']), fill=None, outline="red", width=3)
                #     fileHash = hashlib.md5(image['url'].encode())
                #     cropAnnotation.save(Path(self.cacheDir, f'{fileHash.hexdigest()}.png'))

            screenshot.save(Path(self.cacheDir, 'screenshots', 'annotation.png'))
            screenshot.close()


    def _loadTemplate(self) -> None:
        uri = self.uri
        uri = uri[uri.rfind('://')+3:] if uri.rfind('://') > -1 else uri

        templatesCache = Path((Path(self.cacheDir).parent), 'templates')
        if templatesCache.is_dir():
            self._log.info('Loading templates...')
            legalTemplateKeys = ['contentArea', 'zones', 'selectors']
            for tFileName in os.listdir(templatesCache.absolute()):
                with open(Path(templatesCache, tFileName), 'r', encoding='utf-8') as tFile:
                    template = json.load(tFile)
                    try:
                        if 'urlRegex' not in template.keys() or any(not k in legalTemplateKeys for k in template.keys()):
                            raise ValueError("Template contains invalid root keys")

                        urlRegex = template['urlRegex']
                        if re.match(urlRegex, uri):
                            # Matching template found, validate content
                            if 'contentArea' in template:
                                if len(template['contentArea']) != 4 or (sorted(['t', 'b', 'l', 'r']) != sorted(template['contentArea'].keys())):
                                    raise ValueError("Invalid boundary key found for template 'contentArea'")
                                if any(type(v) != int or v < -1 for v in template['contentArea'].values()):
                                    raise ValueError("Invalid boundary value found for template 'contentArea'")

                            if 'zones' in template:
                                for i, zone in enumerate(template['zones']):
                                    if len(zone) != 5 or (sorted(['t', 'b', 'l', 'r', 'multiplier']) != sorted(zone.keys())):
                                        raise ValueError("Invalid keys found for template 'contentArea'")
                                    if any(k != 'multiplier' and (type(v) != int or v < -1) for k, v in zone.items()):
                                        raise ValueError(f'Invalid boundary value for template zone {i}')
                                    if (type(zone['multiplier']) != int or type(zone['multiplier']) != float) and zone['multiplier'] < 0.0:
                                        raise ValueError(f'Invalid multiplier value for template zone {i}')

                            if 'selectors' in template:
                                pass

                            self._log.info(f'Template applied: {tFileName}')
                            self._template = template
                            return
                    except ValueError as e:
                        self._log.error(f'Error parsing template: {e}')


    def _isMissing(self, url) -> bool:
        networkMatch = list(filter(lambda nRes: nRes['response']['url'].lower().endswith(url.lower()) and nRes['response']['status'] not in [301, 302, 303, 307, 308],
                                                self._netData['responses']))
        return True if networkMatch and networkMatch[0]['response']['status'] != 200 else False


    def _purgeBlacklistedURLs(self) -> None:
        """Remove css/image/video/iframe entries whose URL is missing or starts
        with a blacklisted (archive-infrastructure) prefix."""
        def _isBlacklisted(url):
            for blacklistedURI in URI_BLACKLIST:
                if url.startswith(blacklistedURI):
                    return True

            # If not defined, check whether uri has header 'Link' containing <http://mementoweb.org/terms/donotnegotiate>; rel="type"
            # if url in self._netData['responses']:
            #     log = netData['response'][url]
            #     if 'headers' in log and 'link' in log['headers']:
            #         if log['headers']['link'] == '<http://mementoweb.org/terms/donotnegotiate>; rel="type"':
            #             return True
            return False

        if not self._crawlData: return
        for category in ('css', 'image', 'video', 'iframe'):
            self._crawlData[category] = [
                entry for entry in self._crawlData[category]
                if 'url' in entry and not _isBlacklisted(entry['url'])
            ]


    def _purgeHiddenElements(self) -> None:
        for key in ['text', 'image', 'video']:
            filteredData = list(filter(lambda i: 'visible' in i and i['visible'], self._crawlData[key]))
            self._crawlData[key] = filteredData


    # def _findBlankElements(self) -> None:
    #     self._log.info('Finding blank elements')
    #     screenshot = Image.open(Path(self.cacheDir, 'screenshots', 'screenshot.png'))
    #     for bb in self._bboxData:
    #         x1 = int(bb['x'])
    #         y1 = int(bb['y'])
    #         x2 = int(bb['x'])+int(bb['width'])
    #         y2 = int(bb['y'])+int(bb['height'])

    #         imgArea = screenshot.crop((x1, y1, x2, y2))
    #         dominantColorPercentage = utils.dominantPixelPercentage(imgArea)
    #         if dominantColorPercentage > 0.90:
    #             print('Blank element found')
    #             self._annotations.append({
    #                 'type': 'blank-element',
    #                 'xy': (x1, y1, x2, y2),
    #                 'fill': "purple",
    #                 'outline': "purple"
    #             })

    #     screenshot.close()


    def _calculatePageMetrics(self) -> None:
        self._log.info('Calculating page metrics')

        try:
            if not Path(self.cacheDir, 'screenshots', 'screenshot.png').is_file():
                if self.debug: print('Unable to load screenshot file')
                raise Exception('Unable to load screenshot file')

            screenshot = Image.open(Path(self.cacheDir, 'screenshots', 'screenshot.png'))

            # If crawl URL is from archive.org, crop out the top banner
            # INFO: Obsolete with commit c785174af5726bd5510f9d36c22f4baab70f394b
            # if 'web.archive.org/web/' in self.uri:
            #     dScr = ImageDraw.Draw(screenshot)
            #     dScr.rectangle(xy=[0, 0, screenshot.size[0], 155], fill='white')
            # ---

            pixelArray = np.array(screenshot.convert('L'))

            blackPixels = (pixelArray < 255).astype(np.uint8)
            columnPixels = blackPixels.sum(axis=0)
            rowPixels = blackPixels.sum(axis=1)

            self._pageData = {}
            self._pageData['page'] = {
                'width': screenshot.size[0],
                'height': screenshot.size[1],
                'heatmap_x': columnPixels.tolist(),
                'heatmap_y': rowPixels.tolist()
            }

            if self._template and 'contentArea' in self._template:
                tContentArea = self._template['contentArea']
                self._pageData['content'] = {
                    'left': tContentArea['left'], 'right': tContentArea['right'],
                    'top': tContentArea['top'], 'bottom': tContentArea['bottom'],
                    'width': tContentArea['width'], 'height': tContentArea['height']
                }
            else:
                content = np.where(blackPixels == np.amax(blackPixels))
                left, top = np.min(content[1]), np.min(content[0])
                right, bottom = np.max(content[1]), np.max(content[0])
                width, height = right-left, bottom-top
                self._pageData['content'] = {
                    'left': left, 'right': right,
                    'top': top, 'bottom': bottom,
                    'width': width, 'height': height
                }

            screenshot.close()
        except:
            self._log.error('Unable to calculate page metrics.', exc_info=True)
            self._pageData = None


    def _calculateJavascriptMetrics(self) -> tuple:
        self._log.info('Assessing JavaScript damage...')

        jsByteMap = []
        if Path(self.cacheDir, 'data', 'jsByteMap.jsonl').is_file():
            with open(Path(self.cacheDir, 'data', 'jsByteMap.jsonl'), mode='r', encoding='utf-8') as f:
                jsByteMap = [json.loads(line) for line in list(f)]
        else:
            self._log.info(f'No byte data available')

        for i, js in enumerate(self._crawlData['js']):
            self._precursors['js'][i] = {}

            if 'url' in js and self._isMissing(js['url']): self._precursors['js'][i]['missing'] = True

            self._precursors['js'][i]['metrics'] = {
                'usedBytes': 0,
                'totalBytes': 0
            }
            self._precursors['js'][i]['multipliers'] = {
                'net': 1.0
            }

            jsBytes = list(filter(lambda j: j['url'].endswith(js['url']), jsByteMap))
            print(f"Checking bytemap for {js['url']}")
            if len(jsBytes) > 0:
                jsBytes = jsBytes[0]
                self._precursors['js'][i]['metrics'].update({'usedBytes': jsBytes['usedBytes'], 'totalBytes': jsBytes['totalBytes']})


    def _calculateStylesheetMetrics(self) -> tuple:
        self._log.info('Assessing stylesheet damage')

        # Element locations (all along left?)
        elementPlacementLeftDominant = False

        # Pixel distribution
        p3 = int(len(self._pageData['page']['heatmap_x']) / 3)
        total = np.sum(self._pageData['page']['heatmap_x'])
        total = total if total > 0 else 1
        left = np.sum(self._pageData['page']['heatmap_x'][0:p3]) / total
        center = np.sum(self._pageData['page']['heatmap_x'][p3:p3*2]) / total
        right = np.sum(self._pageData['page']['heatmap_x'][p3*2:]) / total

        # print(f'Pixel distribution: {left:.3f} {center:.3f} {right:.3f}')
        if left > 0.7 and (left > center and center > right) or elementPlacementLeftDominant:
            self._log.info('Primary stylesheet is potentially damaged or missing')

        if total == 0:
            heatmapRatio = 0.0
        else:
            heatmapRatio = max(left, center, right)

        cssByteMap = []
        if Path(self.cacheDir, 'data', 'cssByteMap.jsonl').is_file():
            with open(Path(self.cacheDir, 'data', 'cssByteMap.jsonl'), mode='r', encoding='utf-8') as f:
                cssByteMap = [json.loads(line) for line in list(f)]
        else:
            self._log.info(f'No byte data available')

        for i, stylesheet in enumerate(self._crawlData['css']):
            cssUrl = stylesheet['url']
            rules = stylesheet['rules']
            numRules = len(rules) if len(rules) > 0 else 1
            declarations = stylesheet['totalDeclarations'] if 'totalDeclarations' in stylesheet else 0
            references = stylesheet['totalReferences'] if 'totalReferences' in stylesheet else 0

            self._precursors['css'][i] = {}

            if self._isMissing(cssUrl): self._precursors['css'][i]['missing'] = True

            self._precursors['css'][i]['metrics'] = {
                'usedBytes': 0,
                'totalBytes': 0,
                'rules': numRules,
                'references': references
            }
            self._precursors['css'][i]['multipliers'] = {
                'heatmap': heatmapRatio
            }

            cssBytes = list(filter(lambda b: b['url'].endswith(cssUrl), cssByteMap))
            if len(cssBytes) > 0:
                cssBytes = cssBytes[0]
                self._precursors['css'][i]['metrics'].update({'usedBytes': cssBytes['usedBytes'], 'totalBytes': cssBytes['usedBytes']})


    def _calculateIFrameMetrics(self) -> tuple:
        self._log.info('Assessing IFrame element metrics')

        for i, el in enumerate(self._crawlData['iframe']):
            self._precursors['iframe'][i] = {}
            self._precursors['iframe'][i]['metrics'] = [{
                'width': 0,
                'height': 0,
                'x': 0,
                'y': 0,
                'styles': 0,
            }]
            self._precursors['iframe'][i]['multipliers'] = [{
                'location': 1.0,
                'semantic': 1.0
            }]
            # self._precursors['iframe'][i]['metrics'] = [{
            #     'width': el['width'],
            #     'height': el['height'],
            #     'x': el['left']+el['width'],
            #     'y': el['top']+el['height'],
            #     'styles': len(el['styles']) if 'styles' in el else 0,
            # }]
            # self._precursors['iframe'][i]['multipliers'] = [{
            #     'location': 1.0,
            #     'semantic': 1.0
            # }]


    def _calculateTextMetrics(self) -> None:
        '''
        - Plain text - num words * size
        - Link text - size * link value * (1|0)                (1|0 = link 200 vs 404)
        - Mixed

        unicode = 2 bytes per character
        '''
        self._log.info('Calculating text element metrics')

        for i, el in enumerate(self._crawlData['text']):
            numCharacters = len(el['text'])
            numWords = len(el['text'].split(' '))
            numLinks = 0

            self._precursors['text'][i] = {}
            self._precursors['text'][i]['metrics'] = [{
                'width': el['area']['width'],
                'height': el['area']['height'],
                'x': el['area']['left']+el['area']['width'],
                'y': el['area']['top']+el['area']['height'],
                'bytes': numCharacters * 2,
                'styles': len(el['classes']) if 'classes' in el else 0,
                'characters': numCharacters,
                'words': numWords,
                'links': numLinks
            }]
            self._precursors['text'][i]['multipliers'] = [{
                'location': 1.0,
                'semantic': 1.0
            }]
            # self._precursors['image'][i]['links'] = numLinks

            # Add annotation
            # x1 = int(el['left'])
            # y1 = int(el['top'])
            # x2 = int(el['left'])+int(el['width'])
            # y2 = int(el['top'])+int(el['height'])
            # self._addAnnotation('text', i, {
            #     'type': 'text',
            #     'xy': (x1, y1, x2, y2),
            #     'outline': (0, 204, 102)
            # })


    def _calculateImageMetrics(self) -> None:
        self._log.info('Calculating image element metrics')

        # Pass 1 - check if missing and determine average size of present images
        avgImgSize = {}
        for i, el in enumerate(self._crawlData['image']):
            self._precursors['image'][i] = {}

            if self._isMissing(el['url']): self._precursors['image'][i]['missing'] = True

            self._precursors['image'][i]['metrics'] = [{
                'width': r['width'],
                'height': r['height'],
                'bytes': 0,
                'styles': len(el['classes']) if 'styles' in el else 0,
            } for r in el['area']]
            self._precursors['image'][i]['multipliers'] = [{
                'location': 1.0,
                'semantic': 1.0
            }] * len(el['area'])

            if 'missing' not in self._precursors['image'][i]:
                parentString = '.'.join(el['parents'])
                if parentString not in avgImgSize: avgImgSize[parentString] = []
                for area in el['area']:
                    avgImgSize[parentString].append((area['width'], area['height']))

        if len(avgImgSize) > 0:
            for key in avgImgSize:
                avgImgSize[key] = (sum([r[0] for r in avgImgSize[key]]) / len(avgImgSize[key]),
                                   sum([r[1] for r in avgImgSize[key]]) / len(avgImgSize[key]))
            # avgImgSizeTotal = (sum([s[0] for s in avgImgSize.values()]) / len(avgImgSize),
            #                    sum([s[0] for s in avgImgSize.values()]) / len(avgImgSize))

        # Pass 2 - Adjust image sizes for missing images
        for i, el in enumerate(self._crawlData['image']):
            if 'missing' in self._precursors['image'][i]:
                wGuess, hGuess = self._guessImageSize(el, avgImgSize)

                for ii, r in enumerate(el['area']):
                    width, height = r['width'], r['height']

                    if 'width' in el and 'height' in el and (r['width'] < el['width'] and r['height'] < el['height']):
                        # use explicit size set from DOM
                        width, height = el['width'], el['height']
                    else:
                        if r['width'] < wGuess and r['height'] < hGuess:
                            width, height = wGuess, hGuess

                    self._precursors['image'][i]['metrics'][ii].update({'width': width, 'height': height})
                    self._precursors['image'][i]['metrics'][ii].update({'x': r['left'] + int(width/2), 'y': r['top'] + int(height/2)})
                    self._precursors['image'][i]['metrics'][ii].update({'width_augment': width - r['width'], 'height_augment': height - r['height']})

                    bbox = (int(r['left']),
                            int(r['top']),
                            int(r['left'])+int(r['width']),
                            int(r['top'])+int(r['height']))
                    self._addAnnotation('image', i, bbox, ii)


    def _guessImageSize(self, el, contextSizes=None):
        """Guess the (width, height) of a missing image element.

        Heuristics, in order: CSS class names, alt text, then the average size
        of sibling images under the same DOM parent chain (*contextSizes*),
        falling back to a thumbnail size.
        """
        # Evaluate class names
        if 'classes' in el:
            for class_ in el['classes']:
                if 'emoji' in class_:
                    return S_EMOJI[0], S_EMOJI[1]
                elif 'avatar' in class_:
                    return S_AVATAR[0], S_AVATAR[1]
                elif 'thumb' in class_ or 'thumbnail' in class_ or 'logo' in class_:
                    return S_THUMBNAIL[0], S_THUMBNAIL[1]

        # Evaluate alt text; truthiness guard so an empty alt cannot raise
        # IndexError on el['alt'][0] below.
        if el.get('alt'):
            if ' ' not in el['alt']: # Character, single word, or URL
                if ((el['alt'][0] == ':' and el['alt'][-1] == ':')  or el['alt'] in EMOJI_NAMES):
                    return (S_EMOJI[0], S_EMOJI[1])

                if el['alt'].startswith('@') or 'avatar' in el['alt']:
                    return (S_AVATAR[0], S_AVATAR[1])

                if el['alt'].startswith('http'):
                    return (S_IMAGE_SM[0], S_IMAGE_SM[1])

            else: # descriptive alt text
                altWords = el['alt'].split(' ')
                return (S_IMAGE_LG[0], S_IMAGE_LG[1]) if len(altWords) > 5 else (S_IMAGE_SM[0], S_IMAGE_SM[1])

        parentString = '.'.join(el['parents'])
        if contextSizes and parentString in contextSizes:
            # contextSizes values are already (width, height) pairs; the old
            # code wrapped the pair in another tuple, breaking callers that
            # unpack two scalars.
            return contextSizes[parentString]
        else:
            return S_THUMBNAIL[0], S_THUMBNAIL[1]


    def _calculateVideoMetrics(self) -> None:
        self._log.info('Calculating video element metrics')

        for i, el in enumerate(self._crawlData['video']):
            self._precursors['video'][i] = {}

            if self._isMissing(el['url']) or 'error' in el: self._precursors['video'][i]['missing'] = True

            self._precursors['video'][i]['metrics'] = [{
                'width': r['width'],
                'height': r['height'],
                'bytes': 0,
                'styles': len(el['classes']) if 'classes' in el else 0,
                'duration': 0
            } for r in el['area']]

            self._precursors['video'][i]['multipliers'] = [{
                'location': 1.0,
                'semantic': 1.0
            }] * len(el['area'])

            if 'missing' in self._precursors['video'][i]:
                for ii, r in enumerate(el['area']):
                    if r['width'] < S_VIDEO_SM[0] and r['height'] < S_VIDEO_SM[1]:
                        self._precursors['video'][i]['metrics'][ii].update({'width': S_VIDEO_SM[0], 'height': S_VIDEO_SM[1]})
                        self._precursors['video'][i]['metrics'][ii].update({'x': r['left'] + int(S_VIDEO_SM[0]/2), 'y': r['top'] + int(S_VIDEO_SM[1]/2)})
                        self._precursors['video'][i]['metrics'][ii].update({'width_augment': S_VIDEO_SM[0] - r['width'], 'height_augment': S_VIDEO_SM[1] - r['height']})

                    bbox = (int(r['left']),
                            int(r['top']),
                            int(r['left'])+int(r['width']),
                            int(r['top'])+int(r['height']))
                    # bbox = (int(r['left']),
                    #         int(r['top']),
                    #         int(r['left'])+int(self._precursors['video'][i]['metrics'][ii]['width']),
                    #         int(r['top'])+int(self._precursors['video'][i]['metrics'][ii]['height']))
                    self._addAnnotation('video', i, bbox, ii)


    def _calculateGitRepoMetrics(self) -> None:
        return


    def _calculateLocationMultipliers(self) -> None:
        """Adjust per-element 'location' multipliers using template zones.

        When the matched template defines zones, boosts or damps each element
        whose (x, y) center falls inside a zone by (zone multiplier - 1),
        clamping the result at 0.
        """
        # NOTE(review): despite the names, these hold the full page width and
        # height, not center coordinates — and neither is used below.
        pageCenterX = self._pageData['page']['width']
        pageCenterY = self._pageData['page']['height']

        # Ink distribution across 20%/60%/20% horizontal bands; computed but
        # currently unused in this method.
        p5 = int(len(self._pageData['page']['heatmap_x']) / 5)
        total = np.sum(self._pageData['page']['heatmap_x'])
        total = total if total > 0 else 1
        lDist = np.sum(self._pageData['page']['heatmap_x'][0:p5]) / total
        cDist = np.sum(self._pageData['page']['heatmap_x'][p5:p5*4]) / total
        rDist = np.sum(self._pageData['page']['heatmap_x'][p5*4:]) / total
        # print(f"{p5} {total} {lDist} {cDist} {rDist}")

        if self._template and 'zones' in self._template:
            for k in ['iframe', 'text', 'image', 'video']: 
                # NOTE(review): _precursors[k] is a dict keyed by index, so
                # enumerate yields (position, key) — 'el' is the dict KEY here,
                # and el['area'] below would fail on it; confirm the intended
                # iteration (likely _precursors[k].items() or _crawlData[k]).
                for i, el in enumerate(self._precursors[k]):
                    for z in self._template['zones']:
                        if k == 'image' or k == 'video':
                            # z['b'] <= 0 means "open-ended bottom": only the
                            # top/left/right bounds are applied in that case.
                            for ii, bbox in enumerate(el['area']):
                                if z['b'] > 0 and bbox['y'] > z['t'] and bbox['y'] < z['b'] and bbox['x'] > z['l'] and bbox['x'] < z['r']:
                                    self._precursors[k][i]['multipliers'][ii]['location'] += (z['multiplier'] - 1)
                                elif bbox['y'] > z['t'] and bbox['x'] > z['l'] and bbox['x'] < z['r']:
                                    self._precursors[k][i]['multipliers'][ii]['location'] += (z['multiplier'] - 1)
                                if self._precursors[k][i]['multipliers'][ii]['location'] < 0:
                                    self._precursors[k][i]['multipliers'][ii]['location'] = 0.0
                        else:
                            # NOTE(review): 'bbox' is not defined in this branch
                            # (it leaks from the image/video loop), and the
                            # text/iframe precursors store 'multipliers' as a
                            # LIST, so the ['location'] indexing below looks
                            # broken — confirm before relying on this path.
                            if z['b'] > 0 and bbox['y'] > z['t'] and bbox['y'] < z['b'] and bbox['x'] > z['l'] and bbox['x'] < z['r']:
                                self._precursors[k][i]['multipliers']['location'] += (z['multiplier'] - 1)
                            elif bbox['y'] > z['t'] and bbox['x'] > z['l'] and bbox['x'] < z['r']:
                                self._precursors[k][i]['multipliers']['location'] += (z['multiplier'] - 1)
                            if self._precursors[k][i]['multipliers']['location'] < 0: self._precursors[k][i]['multipliers']['location'] = 0.0


    def _calculateSemanticMultipliers(self):
        return


    def _calculatePageDamage(self): # @audit @todo
        '''
        # Calculate how much of the content area is utilized
        # utilizedArea = 0.0
        # extraPageHeight = 0
        # categoryArea = {'iframe': 0.0, 'image': 0.0, 'video': 0.0, 'text': 0.0}

        # for c in ['iframe', 'image', 'video', 'text']:
        #     for i, dataItem in enumerate(self._crawlData[c]):
        #         height, width, area = 0.0, 0.0, 0.0
        #         if c in ['image', 'video']:
        #             for rect in dataItem['area']:
        #                 if 'augmented_width' in dataItem or 'augmented_height' in dataItem:
        #                     extraPageHeight += dataItem['augmented_height'] - rect['height']
        #                     height += rect['height']
        #                     width += rect['width']
        #                 else:
        #                     area += rect['width'] * rect['height']
        #                     height += rect['height']
        #                     width += rect['width']
        #         else:
        #             if 'augmented_width' in dataItem or 'augmented_height' in dataItem:
        #                 area = dataItem['augmented_width'] * dataItem['augmented_height']

        #             if c == 'text':
        #                 area = dataItem['width'] * dataItem['height']
        #             elif c in ['image', 'video']:
        #                 extraPageHeight += dataItem['augmented_height'] - dataItem['height']
        #                 height += dataItem['augmented_height'] - dataItem['height']
        #                 width = dataItem['width']

        #         categoryArea[c] += area
        #         utilizedArea += area

        # print(f"{self._pageData['content']['height']} + {extraPageHeight}")
        # contentArea = self._pageData['content']['width'] * (self._pageData['content']['height'] + extraPageHeight)
        # self._pageData['content']['percentage'] = utilizedArea / contentArea
        # print(f'Utilized Area: {utilizedArea} / {contentArea} ({self._pageData["content"]["percentage"]:.2f}%)')

        # for dataKey in ['iframe', 'image', 'video', 'text']:
        #     c = categoryArea[dataKey] / utilizedArea
        #     self._pageData['content'][dataKey] = c
        #     print(f'{dataKey} coverage: {c:.2f}')
        '''

        groupDamage = {'css': 0.0, 'js': 0.0, 'iframe': 0.0, 'text': 0.0, 'image': 0.0, 'video': 0.0}
        actualDamage, potentialDamage = 0.0, 0.0

        # CSS damage calculation
        for i, precursor in self._precursors['css'].items():
            metrics = precursor['metrics']
            # damage = precursor['usedBytes'] \
            #     + (metrics['usedBytes'] * (metrics['references'] / metrics['numRules']))
            # for multiplier in precursor['multipliers'].values(): damage *= multiplier
            # base = cssBytes['usedBytes'] + (cssBytes['usedBytes'] * (references / numRules)) + (cssBytes['usedBytes'] * heatmapRatio)
            # print(f'{cssUrl} | Rules: {references} / {numRules} = {refRatio:.3f} | Bytes: {cssBytes["usedBytes"]} / {cssBytes["totalBytes"]} = {byteRatio:.3f} | {cssDmg:.3f}')
            damage = metrics['usedBytes'] \
                + (metrics['usedBytes'] * (metrics['references'] / metrics['rules'])) \
                + (metrics['usedBytes'] * precursor['multipliers']['heatmap'])

            self._precursors['css'][i]['value'] = damage
            potentialDamage += damage

            if 'missing' in self._precursors['css'][i]:
                groupDamage['css'] += damage
                actualDamage += damage

        # JavaScript damage calculation
        for i, precursor in self._precursors['js'].items():
            metrics = precursor['metrics']
            damage = 0.0

            self._precursors['js'][i]['value'] = damage
            potentialDamage += damage

            if 'missing' in self._precursors['js'][i]:
                groupDamage['js'] += damage
                actualDamage += damage


        for group in ['iframe', 'text', 'image', 'video']:
            for i, precursor in self._precursors[group].items():
                elDamage = 0.0
                for ii, el in enumerate(precursor['metrics']):
                    area = el['width'] * el['height']
                    for m in precursor['multipliers'][ii].values(): area *= m
                    elDamage += area

                potentialDamage += elDamage
                self._precursors[group][i]['value'] = elDamage
                if 'missing' in self._precursors[group][i]:
                    if group == 'text': print('text is missing')
                    groupDamage[group] += elDamage
                    actualDamage += elDamage


        with open(Path(self.cacheDir, 'data', 'precursors.jsonl'), 'w') as precursorFile:
            for k in ['css', 'js', 'iframe', 'text', 'image', 'video']:
                for i in range(len(self._precursors[k])):
                    self._precursors[k][i]['type'] = k
                    self._precursors[k][i]['value'] = (self._precursors[k][i]['value'] / potentialDamage) if potentialDamage > 0.0 else 0.0
                    precursorFile.write(f'{self._precursors[k][i]}\n')

        pageDamage = (actualDamage / potentialDamage) if potentialDamage > 0.0 else 0.0
        pageDamagePercent = pageDamage * 100

        result = {}
        result['uri'] = self.uri
        result['is_archive'] = True if self.warcFile else False
        # result['urir_redirects'] = urirRedirects

        result['total_damage'] = pageDamagePercent
        result['potential_damage'] = {
            'total': potentialDamage,
            'css': sum(p['value'] for p in self._precursors['css'].values()),
            'js': sum(p['value'] for p in self._precursors['js'].values()),
            'iframe': sum(p['value'] for p in self._precursors['iframe'].values()),
            'text': sum(p['value'] for p in self._precursors['text'].values()),
            'image': sum(p['value'] for p in self._precursors['image'].values()),
            'multimedia': sum(p['value'] for p in self._precursors['video'].values()),
        }
        result['actual_damage'] = {
            'total': actualDamage,
            'css': sum(p['value'] for p in self._precursors['css'].values() if 'missing' in p),
            'js': sum(p['value'] for p in self._precursors['js'].values() if 'missing' in p),
            'iframe': sum(p['value'] for p in self._precursors['iframe'].values() if 'missing' in p),
            'text': sum(p['value'] for p in self._precursors['text'].values() if 'missing' in p),
            'image': sum(p['value'] for p in self._precursors['image'].values() if 'missing' in p),
            'multimedia': sum(p['value'] for p in self._precursors['video'].values() if 'missing' in p),
        }
        self.result = result


        with open(Path(self.cacheDir, 'result.json'), 'w') as f:
            f.write(json.dumps(result))

        print('Page   :', f'{result["total_damage"]:.2f}')
        print('CSS    :', f'{result["actual_damage"]["css"]:0.3f}')
        print('JS     :', f'{result["actual_damage"]["js"]:0.3f}')
        print('IFrame :', f'{result["actual_damage"]["iframe"]:0.3f}')
        print('Text   :', f'{result["actual_damage"]["text"]:0.3f}')
        print('Image  :', f'{result["actual_damage"]["image"]:0.3f}')
        print('Video  :', f'{result["actual_damage"]["multimedia"]:0.3f}')

        self._log.info(f'CSS    : {result["actual_damage"]["css"]:0.3f}')
        self._log.info(f'JS     : {result["actual_damage"]["js"]:0.3f}')
        self._log.info(f'IFrame : {result["actual_damage"]["iframe"]:0.3f}')
        self._log.info(f'Text   : {result["actual_damage"]["text"]:0.3f}')
        self._log.info(f'Image  : {result["actual_damage"]["image"]:0.3f}')
        self._log.info(f'Video  : {result["actual_damage"]["multimedia"]:0.3f}')
        self._log.info(f'Total page damage: {self.result["total_damage"]}')


    def analyze(self) -> None: # @audit
        '''Run the full damage-analysis pipeline for self.uri: crawl (if
        needed), filter the crawl data, compute per-category metrics and
        multipliers, score the page, and emit the annotated screenshot.'''
        # Re-crawl when no cached crawl data exists or the cache is ignored.
        if not self._crawlData or self.ignoreCache:
            self._initiatePageCrawl()

            if not self._crawlData:
                self._log.error('No crawl data available')
                return

        self._log.info(f'Beginning analysis for {self.uri}')

        self._log.info('Processing network logs')
        self._purgeBlacklistedURLs()
        self._purgeHiddenElements()
        self._log.info('Assessing page damage')
        self._calculatePageMetrics()
        # _calculatePageMetrics populates self._pageData; abort if it failed.
        if not self._pageData: return

        # Element breakdown
        # NOTE(review): the ratios and styleRules below are computed but never
        # used — the prints that consumed them are commented out.
        numCodeElements = len(self._crawlData['css']) + len(self._crawlData['js'])
        numVisualElements = len(self._crawlData['iframe']) + len(self._crawlData['text']) + len(self._crawlData['image']) + len(self._crawlData['video'])
        totalElements = numCodeElements + numVisualElements
        totalElements = totalElements if totalElements > 0 else 1
        codeRatio = numCodeElements / totalElements
        visualRatio = numVisualElements / totalElements

        styleRules = sum([len(css['rules']) for css in self._crawlData['css']])
        # print(f'CSS Rules: {styleRules}')
        # print(f'Code: {numCodeElements}, ({codeRatio:.3f}%), Content: {numContentElements} ({visualRatio:.3f}%), Total: {totalElements}')
        # print(f"CSS: {len(self._crawlData['css'])}, JS: {len(self._crawlData['js'])}")
        # print(f"Text: {len(self._crawlData['text'])}, Image: {len(self._crawlData['image'])}, Video: {len(self._crawlData['video'])}, iFrame: {len(self._crawlData['iframe'])}")
        # print()

        '''
        javascript file number of bytes used ( / total file bytes ? )
            * successful call ratio multiplier
            * bytes retrieved from func call
            = total javascript damage
        
        css bytes used
            * number of directives
            * number of classname references
        
        element area / page content area = (base value)
            * element type multiplier
            * page location multiplier
            * template zone multiplier
            * selector multiplier
            = total element damage
        
        normalized element damage = total element damage / total damage
        '''

        '''
        given element
        - total css directives = style directives + number of directives of each classname
        - attributes
        - context:
            styles, classnames, attributes, alt, width, height, position, etc
            for each parent of element (up to topmost body) which has only one child (nested: div > div > div > el)
        - position
        - area (width, height)
        '''

        # Per-category precursor records, keyed by element index.
        self._precursors = {}
        for k in ['css', 'js', 'iframe', 'text', 'image', 'video']:
            self._precursors[k] = {}

        # Code-category metrics.
        self._calculateStylesheetMetrics()
        self._calculateJavascriptMetrics()

        # Visual-category metrics.
        self._calculateIFrameMetrics()
        self._calculateTextMetrics()
        self._calculateImageMetrics()
        self._calculateVideoMetrics()

        # Weight elements by page position / template zones.
        self._calculateLocationMultipliers()

        self._calculateSemanticMultipliers()

        self._calculatePageDamage()

        self._generateAnnotatedScreenshot()


    def damageScore(self) -> tuple[str, bool]:
        if self.result:
            return (self.result['error'], True) if 'error' in self.result else (str(self.result['total_damage']), False)
        else:
            return ('Page has not been analyzed', True)


    def _addAnnotation(self, group, index, bbox, r=0):
        if group not in self._annotations: self._annotations[group] = {}
        if index not in self._annotations[group]: self._annotations[group][index] = {}
        self._annotations[group][index][r] = bbox


    def _setElementBoundaries(self):
        # Set css class or custom data attribute for tables, lists, container boundaries, etc
        return


    def _deriveElementContext(self):
        return