B
    /b-                 @   s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZmZmZmZ ddlmZ ddlmZmZ ed	d
Zejddddd ejdddd ejdddddd ejdddd ejddd ejdd d ejd!d"d ejd#d$d ejd%d&d'd( ejd)d*d'd( ejddd+d,d-d-d-d-g g d.
 d/d0 ZG d1d2 d2eZd:d3d4Zd5d6 Zd7d8 Zed9kre  dS );z(arc2warc - convert one arc to a new warc    )print_functionN)OptionParser   )	ArcRecord
WarcRecordMixedRecordexpand_files)warc_datetime_str)ResponseMessageRequestMessagez%prog [options] arc (arc ...))usagez-oz--outputoutputzoutput warc file)desthelpz-lz--limitlimit)r   z-Zz--gzipgzip
store_truecompress)r   actionr   z-Lz--log-level	log_levelz--descriptiondescriptionz
--operatoroperatorz--publisher	publisherz
--audienceaudiencez
--resourceresourceappend)r   r   z
--responseresponseinfoF )
Zoutput_directoryr   r   r   r   r   r   r   r   r   c             C   s*   t t }|| }|  | o(| S )N)r
   r   feedcloseZcomplete)contentmessage	remainder r$   2/usr/lib/python3.7/site-packages/hanzo/arc2warc.pyis_http_response,   s    

r&   c               @   s:   e Zd ZdddZedd Zdd	 Zd
d Zdd ZdS )ArcTransformerN   software: hanzo.arc2warc
r$   c             C   s(   d | _ || _d| _|| _|| _|| _d S )Ns   WARC/1.0)warcinfo_idoutput_filenameversionwarcinfo_fields	resources	responses)selfr*   r,   r-   r.   r$   r$   r%   __init__4   s    zArcTransformer.__init__c             C   s&   dt t|  dd  dS )Nz<urn:uuid:%s>r       ascii)uuidUUIDhashlibsha1	hexdigestencode)textr$   r$   r%   make_warc_uuid<   s    zArcTransformer.make_warc_uuidc             C   s"   |j dkr| |S | |S d S )Ns   filedesc)typeconvert_filedescconvert_record)r/   recordr$   r$   r%   convert@   s    

zArcTransformer.convertc             C   sb  t tj }| |j| }tjtjftj|ftj	|fg}| j
rT|tj| j
f d| jf}t||| jd}|jrt|jdkrtj|jd d dd}ntj|jd d dd}t |}n|}| |j|j d }|j}	|	d	r|	d
d  }	tjtjftj|ftj|ftj|	ftj	|ftj|fg}
d| f}t|
|| jd}|| _||fS )Ns   application/warc-fields)headersr!   r+      r2   z%Y%m%d%H%M%S   z%Y%m%ds   -metas   filedesc://   s   application/arc)r	   datetimenowr:   urlr   TYPEZWARCINFOIDDATEr*   r   ZFILENAMEr,   r+   datelenstrptimedecode
startswithMETADATAZCONCURRENT_TOURLWARCINFO_IDrawr)   )r/   r>   Zwarcinfo_dater)   Zwarcinfo_headersZwarcinfo_contentZ
inforecordZwarcmeta_dateZwarcmeta_idZwarcmeta_urlZwarcmeta_headersZwarcmeta_contentZ
metarecordr$   r$   r%   r<   G   s:    

 

zArcTransformer.convert_filedescc       
         s  |  |j|j }tj|ftj|jftj| jfg}|jrytj	|j
dd}W q tk
r|   tj	|j
dd}Y qX n
tj }|tj}|r| }|dkr|tj|f |tjt|f |j\}}| sd}|j  t fdd| jD rtj}nt fdd| jD r2tj}nn d	r\t|rTd
}tj}ntj}nD dr|drt|
dd|krtj}ntj}ntj}|tj|f t|||f| j d}	|	fS )Nr2   z%Y%m%d%H%M%Sz%Y%m%ds   0.0.0.0s   application/octet-streamc             3   s   | ]}  |V  qd S )N)rN   ).0p)rF   r$   r%   	<genexpr>   s    z0ArcTransformer.convert_record.<locals>.<genexpr>c             3   s   | ]}  |V  qd S )N)rN   )rS   rT   )rF   r$   r%   rU      s    s   https!   application/http;msgtype=responses   dnss   text/dnsignore)r@   r!   r+   )!r:   rF   rJ   r   rH   rP   rQ   r)   rD   rL   rM   
ValueErrorrE   
get_headerr   ZIPstripr   Z
IP_ADDRESSrI   r	   r!   loweranyr-   ZRESOURCEr.   ZRESPONSErN   r&   strrG   r+   )
r/   r>   Zwarc_idr@   rJ   ipcontent_typer!   Zrecord_type
warcrecordr$   )rF   r%   r=   x   sJ    




"zArcTransformer.convert_record)Nr(   r$   r$   )	__name__
__module____qualname__r0   staticmethodr:   r?   r<   r=   r$   r$   r$   r%   r'   3   s
   
1r'   c          	   C   s4   d ddt  d|  d| d| d| gdS )	Nz
zsoftware: hanzo.arc2warczhostname: %szdescription: %szoperator: %szpublisher: %szaudience: %szutf-8)joinsocketgethostnamer8   )r   r   r   r   r$   r$   r%   r,      s    
r,   c          	   C   sH  t j| dd  d\}}ytjj}W n tk
r>   tj}Y nX |jrdt|jd}|jdrdd|_	t
|dk rzt d t|j|j|j|jd}t|j||j|j}xt|D ]}tj|dd	}zrxl|D ]d}t|trtd
|jtjd |g}	ntd|jtjd ||}	x|	D ]}
|
j||j	d qW qW W d |  X qW dS )Nr   )argsabz.gzTzno imput warc file(s))r   r   r   r   auto)filenamer   z   WARC)filezARC    )r   r   )parser
parse_argssysstdoutbufferAttributeErrorr   openendswithr   rK   errorr,   r   r   r   r   r'   r   r   r   r   Zopen_archive
isinstancer   printrF   stderrr?   Zwrite_tor    )argvoptionsZinput_filesoutZwarcinfoarcnamefhr>   Zwarcsr_   r$   r$   r%   main   s<    





r~   c               C   s   t tt j d S )N)rn   exitr~   rx   r$   r$   r$   r%   run   s    r   __main__)r   r   r   r   ) __doc__
__future__r   osrn   r5   r3   os.pathrD   re   optparser   Z	warctoolsr   r   r   r   Zwarctools.warcr	   Z	httptoolsr
   r   rl   
add_optionset_defaultsr&   objectr'   r,   r~   r   r`   r$   r$   r$   r%   <module>   sJ   


 
(
