""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    # Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
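
# The block below is a usage sketch, not part of the module itself: it feeds
# a small, made-up robots.txt to the parser via parse() (avoiding network
# access) and queries the result. The agent name "ExampleBot/1.0" and every
# rule and URL are illustrative assumptions, not real-world data.


def _demo():
    # Rule order matters: rulelines are matched first-to-last and the first
    # match wins, so the specific Allow must precede the broader Disallow.
    lines = [
        "User-agent: *",
        "Allow: /private/public-note.html",
        "Disallow: /private/",
        "Crawl-delay: 3",
        "Request-rate: 5/10",
        "Sitemap: https://example.com/sitemap.xml",
    ]
    rp = RobotFileParser()
    rp.parse(lines)
    agent = "ExampleBot/1.0"  # hypothetical crawler identifier
    print(rp.can_fetch(agent, "/private/secret.html"))       # False
    print(rp.can_fetch(agent, "/private/public-note.html"))  # True
    print(rp.crawl_delay(agent))   # 3
    print(rp.request_rate(agent))  # RequestRate(requests=5, seconds=10)
    print(rp.site_maps())          # ['https://example.com/sitemap.xml']


if __name__ == "__main__":
    _demo()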