
    \j<                         d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlm	Z	m
Z
mZ dedeeef   ddfdZdd	ed
edefdZd Z G d d      Z G d d      Zy)    N)Path)patch)ErrorMessage_unzip_iter_validate_member	file_pathmembersreturnc                     t        j                  | d      5 }|j                         D ]  \  }}|j                  ||        	 ddd       y# 1 sw Y   yxY w)zR
    Create a ZIP file at file_path, with the given arcname->content mapping.
    wN)zipfileZipFileitemswritestr)r   r	   zfarcnamecontents        N/root/env/lib/python3.12/site-packages/nltk/test/unit/test_downloader_unzip.py	_make_zipr      sQ     
C	( *B ' 	*GWKK)	** * *s   +AAzip_pathextract_rootverbosec                 T    t        t        t        |       t        |      |            S )zj
    Convenience wrapper that runs _unzip_iter and returns the list of yielded
    messages (if any).
    r   )listr   str)r   r   r   s      r   _run_unzip_iterr      s!    
 CM3|+<gNOO    c                     | D cg c]  }t        |t              s| }}|s
J d|       dj                  d |D              }|D ]  }||v rJ d|d|         |S c c}w )z{Assert that *messages* contain at least one ``ErrorMessage`` whose
    text includes every string in *expected_substrings*.z$Expected ErrorMessage(s) containing  c              3   F   K   | ]  }t        |j                          y wN)r   message.0ms     r   	<genexpr>z"_assert_blocked.<locals>.<genexpr>"   s     91AII9s   !z	Expected z in error output: )
isinstancer   join)messagesexpected_substringsr&   err_msgscombinedss         r   _assert_blockedr/      s     $Caz!\'BCHCS;<O;RSS8xx999H  LH}K	!.@
KK}LO Ds
   AAc                      e Zd ZdZdeddfdZdeddfdZej                  j                  e
j                  j                  d      d	      deddfd
       ZdeddfdZdeddfdZdeddfdZej                  j                  e
j                  j                  d      d	      deddfd       ZdeddfdZdeddfdZdeddfdZdeddfdZy)TestSecureUnzipaT  
    Tests for the validate-then-extract strategy in ``_unzip_iter``.

    The implementation scans every member for security violations (path
    traversal, absolute paths, symlink escapes, null bytes) *before*
    extracting anything.  If any member fails validation the entire archive
    is rejected and nothing is written to disk.
    tmp_pathr
   Nc                     |dz  }|dz  }ddd}t        ||       t        ||d      }t        d |D              rJ |d	z  d
z  j                         dk(  sJ |d	z  dz  dz  j                         dk(  sJ y)z
        A ZIP with only safe, relative paths should fully extract under the
        given root, and should not yield any ErrorMessage.
        zsafe.zipextracts   hellos   world)pkg/file.txtzpkg/subdir/other.txtFr   c              3   <   K   | ]  }t        |t                y wr"   r(   r   r$   s     r   r'   zKTestSecureUnzip.test_normal_relative_paths_are_extracted.<locals>.<genexpr>B   s     Eqz!\2E   pkgzfile.txtsubdirz	other.txtN)r   r   any
read_bytes)selfr2   r   r   r	   r*   s         r   (test_normal_relative_paths_are_extractedz8TestSecureUnzip.test_normal_relative_paths_are_extracted2   s    
 j()+ %$,
 	(G$"8\5IEHEEEEu$z1==?8KKKu$x/+=IIKxWWWr   c                     |dz  }|dz  }|dz  dz  j                         }ddd}t        ||       t        ||d	      }t        |d
d       |j	                         rJ |dz  dz  j	                         rJ y)aD  
        An entry containing ``..`` that would escape the target directory
        must not be written outside the extraction root, and must cause
        _unzip_iter to yield an ErrorMessage.

        The entire archive is rejected: even safe entries must NOT be
        extracted when any member fails validation.
        zzip_slip_parent.zipr4   z..zoutside.txt   ok   evil)pkg/good.txtz../outside.txtFr   Zip Slipblockedr9   good.txtN)resolver   r   r/   exists)r=   r2   r   r   outside_targetr	   r*   s          r   8test_zip_slip_with_parent_directory_component_is_blockedzHTestSecureUnzip.test_zip_slip_with_parent_directory_component_is_blockedG   s     33)+&-=FFH "%
 	(G$"8\5I*i8!((*** !5(:5==????r   win2Absolute POSIX paths are not meaningful on Windowsreasonc                    |dz  }|dz  }t        d      dt        j                          z  }	 ddt        |      di}t	        ||       t        ||d	      }t        |d
d       |j                         rJ |dz  dz  j                         rJ 	 |j                         r	 |j                          yy# t        $ r Y yw xY w# |j                         r!	 |j                          w # t        $ r Y w w xY ww xY w)a  
        An entry with an absolute POSIX path (e.g. ``/tmp/evil``) must not be
        extracted as-is; it should not overwrite arbitrary filesystem paths,
        and should result in an ErrorMessage.

        The entire archive is rejected when any member fails validation.
        zzip_slip_abs_posix.zipr4   /tmpnltk_zip_slip_test_rB   r@   rA   Fr   rC   rD   r9   rE   N)
r   osgetpidr   r   r   r/   rG   unlinkOSError)r=   r2   r   r   absolute_targetr	   r*   s          r   1test_zip_slip_with_absolute_posix_path_is_blockedzATestSecureUnzip.test_zip_slip_with_absolute_posix_path_is_blockede   s    66)+v,+>ryy{m)LL	O$gG h(&xuMHHj)<&--/// %u,z9AACCCC%%'#**, (   %%'#**,  (sB   A C !B3 3	B?>B?C5C%$C5%	C1.C50C11C5c                    t        t        d      st        j                  d       |dz  }|dz  }|dz  }|j	                          |dz  }ddd	}t        ||       |j	                          	 t        j                  ||d
z         t        ||d      }|j                         rJ t        |dd       |dz  dz  j                         rJ y# t        $ r t        j                  d       Y fw xY w)a  
        If there is a pre-existing symlink below the extraction root that
        points outside the root, writing through that symlink should not
        be allowed to escape the root.

        The entire archive is rejected when any member fails validation.
        symlinkz'Symlinks not supported on this platformzzip_slip_symlink.zipr4   outside_dirzevil.txtr@   rA   )rB   zdir_link/evil.txtdir_linkz/Symlink creation not permitted on this platformFr   zSymlink escaperD   r9   rE   N)hasattrrQ   pytestskipmkdirr   rX   rT   r   rG   r/   )r=   r2   r   r   rY   rH   r	   r*   s           r   :test_entries_resolved_outside_root_are_blocked_via_symlinkzJTestSecureUnzip.test_entries_resolved_outside_root_are_blocked_via_symlink   s     r9%KKAB44)+.$z1 "!(
 	(G$	KJJ{L:$=> #8\5I!((***"2I> !5(:5==????  	KKKIJ	Ks   ,C C,+C,c                     |dz  }|j                  d       |dz  }t        ||d      }t        d |D              sJ |j                         rt        |j	                               rJ yy)z
        A corrupt or non-zip file should cause _unzip_iter to yield an
        ErrorMessage instead of raising an unhandled exception.
        znot_a_zip.txts   this is not a zip archiver4   Fr   c              3   <   K   | ]  }t        |t                y wr"   r7   r$   s     r   r'   zGTestSecureUnzip.test_bad_zipfile_yields_errormessage.<locals>.<genexpr>   s     A1:a.Ar8   N)write_bytesr   r;   rG   iterdirr=   r2   r   r   r*   s        r   $test_bad_zipfile_yields_errormessagez4TestSecureUnzip.test_bad_zipfile_yields_errormessage   st    
 o-9:)+"8\5IAAAAA <//12222 !r   c                 H   |dz  }|dz  }t        |ddd       ddg}t        d|	      5  t        d
      5 }t        ||d      }ddd       ddd       t        dd       j	                          |dz  dz  j                         rJ y# 1 sw Y   GxY w# 1 sw Y   KxY w)a  
        A member name containing a null byte must be rejected.  Null bytes
        can cause path truncation on some platforms, so they are never
        legitimate in archive entry names.

        The entire archive is rejected when any member fails validation.

        Note: CPython's zipfile module truncates names at null bytes on
        read, so we patch ``namelist()`` to simulate a library that
        preserves them.
        znull_byte.zipr4   r@   rA   )rB   zpkg/evil.txtrB   pkg/evil .txt(nltk.downloader.zipfile.ZipFile.namelist)return_value'nltk.downloader.zipfile.ZipFile.extractFr   N	Null byterD   r9   rE   )r   r   r   r/   assert_not_calledrG   )r=   r2   r   r   poisoned_namesmock_extractr*   s          r   (test_null_byte_in_member_name_is_blockedz8TestSecureUnzip.test_null_byte_in_member_name_is_blocked   s     o-)+(UGLM(*<=6'
 	N 5
	N
 &xuMH	N 	N 	+y9&&( 5(:5==????	N 	N 	N 	Ns"   BBBB	BB!c                 D   |dz  }|dz  }t        d      dt        j                          z  }	 ddddt        |      d	d
di}t	        ||       t        ||d      }t        |d      }t        |      dk\  sJ d       |j                         rJ |j                         rt        |j                               rJ |j                         r	 |j                          yy# t        $ r Y yw xY w# |j                         r!	 |j                          w # t        $ r Y w w xY ww xY w)a  
        An archive that combines several different violation types (path
        traversal and absolute path) must report every violation and
        extract nothing.  This verifies that the validation scan does not
        short-circuit after the first bad entry.
        zmulti_violation.zipr4   rO   nltk_multi_viol_test_z
data/a.txts   aaaz../traversal.txts   evil1s   evil2z
data/b.txts   bbbFr   rC      z#Expected at least two ErrorMessagesN)r   rQ   rR   r   r   r   r/   lenrG   r;   rc   rS   rT   )r=   r2   r   r   rU   r	   r*   r,   s           r   @test_multiple_violation_types_all_reported_and_nothing_extractedzPTestSecureUnzip.test_multiple_violation_types_all_reported_and_nothing_extracted   s:    33)+v,+@)NN	f"HO$hf	G h(&xuMH&x<Hx=A%L'LL%&--///""$|335666%%'#**, (   %%'#**,  (sB   BC, C 	C)(C),D>DD	DDDDc                 4   |dz  }|dz  }|dz  }t        |ddi       |j                          |j                  d       t        dt	        d      	      5  t        ||d
      }ddd       t        d       |j                         dk(  sJ y# 1 sw Y   +xY w)z{
        If extraction fails mid-stream, pre-existing content under the
        extraction root must be preserved.
        zextract_error.zipr4   zalready_there.txtr5      datas   keep-merj   zsimulated extraction failureside_effectFr   NzExtraction error)r   r^   rb   r   rT   r   r/   r<   )r=   r2   r   r   existing_filer*   s         r   >test_extraction_error_does_not_delete_preexisting_root_contentzNTestSecureUnzip.test_extraction_error_does_not_delete_preexisting_root_content  s     11)+$'::(^W56!!*-5 >?
 	N 'xuMH		N 	"45'')Z777	N 	Ns   BBc                    |dz  }|dz  }t        |ddi       t        dt        d            5  t        ||d	      }d
d
d
       t	        d       |j                         rt        |j                               rJ y
y
# 1 sw Y   BxY w)z
        If ``zf.namelist()`` itself raises (e.g. corrupted central directory),
        an ErrorMessage must be yielded and the zip file must be closed.
        znamelist_bomb.zipr4   r5   rv   rh   zcorrupted central directoryrw   Fr   N)r   r   RuntimeErrorr   r/   rG   r;   rc   rd   s        r   (test_namelist_raises_yields_errormessagez8TestSecureUnzip.test_namelist_raises_yields_errormessage'  s    
 11)+(^W566$%BC
 	N 'xuMH		N 	"?@ <//12222 !	N 	Ns   A??Bc                     |dz  }|dz  }ddi}t        ||       t        ||d       |j                         }d|j                  v sJ y)	z
        When verbose=True, _unzip_iter should write a status line to stdout.
        This checks that existing user-visible behaviour is preserved.
        zverbose.zipr4   r5   rv   Tr   	UnzippingN)r   r   
readouterrout)r=   capsysr2   r   r   r	   captureds          r   (test_unzip_iter_verbose_writes_to_stdoutz8TestSecureUnzip.test_unzip_iter_verbose_writes_to_stdout<  sX    
 m+)+!7+(G$,=$$&hll***r   c                     |dz  }|j                  d       |dz  }t        ||d       |j                         }d|j                  v sJ |j                  j	                  d      sJ y)	z
        When verbose=True and the file is not a valid zip, the output line
        must still be terminated with a newline so the terminal is left in
        a clean state.
        zcorrupt.txts	   not a zipr4   Tr   r   
N)rb   r   r   r   endswith)r=   r   r2   r   r   r   s         r   "test_verbose_output_on_corrupt_zipz2TestSecureUnzip.test_verbose_output_on_corrupt_zipK  sj     m+\*)+,=$$&hll***||$$T***r   )__name__
__module____qualname____doc__r   r>   rI   r\   markskipifsysplatform
startswithrV   r_   re   ro   rt   rz   r}   r   r    r   r   r1   r1   (   sb   X X$ X*@@	@< [[&C  !$ !SW !	!F%@%@	%@N3T 3d 3 @ @$ @@ [[&C  %%	%	%N88	823 3$ 3*+ +RV ++4 +D +r   r1   c                   &   e Zd ZdZd Zd Zd Zd Zej                  j                  ej                  j                  d      d      d	        Zej                  j                  ej                  j                  d       d
      d        Zd Zd Zy)TestValidateMemberzADirect unit tests for ``_validate_member`` path validation logic.c                 <    t        |dz        }t        d|      J y )Nrootr5   r   r   )r=   r2   r   s      r   test_safe_relative_path_passesz1TestValidateMember.test_safe_relative_path_passes^  s$    8f$%5===r   c                 H    t        |dz        }t        d|      }|d|v sJ y )Nr   ../evil.txtrC   r   r=   r2   r   results       r   test_parent_traversal_blockedz0TestValidateMember.test_parent_traversal_blockedb  s3    8f$%!-6!jF&:::&:r   c                 H    t        |dz        }t        d|      }|d|v sJ y )Nr   za/b/c/../../../../evil.txtrC   r   r   s       r   $test_deeply_nested_traversal_blockedz7TestValidateMember.test_deeply_nested_traversal_blockedg  s4    8f$%!">E!jF&:::&:r   c                 H    t        |dz        }t        d|      }|d|v sJ y )Nr   rg   rk   r   r   s       r   test_null_byte_blockedz)TestValidateMember.test_null_byte_blockedl  s4    8f$%!"4d;!kV&;;;&;r   rJ   rK   rL   c                 H    t        |dz        }t        d|      }|d|v sJ y )Nr   z/etc/passwdrC   r   r   s       r    test_absolute_posix_path_blockedz3TestValidateMember.test_absolute_posix_path_blockedq  s5    
 8f$%!-6!jF&:::&:r   z1Drive-letter paths are only meaningful on Windowsc                 H    t        |dz        }t        d|      }|d|v sJ y )Nr   zC:\Windows\evil.txtrC   r   r   s       r   !test_windows_drive_letter_blockedz4TestValidateMember.test_windows_drive_letter_blockedz  s6    
 8f$%!"94@!jF&:::&:r   c                     t        |dz        }t        d|      }t        j                  j	                  d      r	|d|v sJ y|J y)zOn Windows, backslash is a path separator so ``..\evil.txt``
        is a traversal attack.  On POSIX, backslash is a literal filename
        character and the member name is harmless.r   z..\evil.txtrJ   NrC   )r   r   r   r   r   r   s       r   test_backslash_traversalz+TestValidateMember.test_backslash_traversal  sQ     8f$%!.$7<<""5)%**>>>*>>!>r   c                     t        |dz        }t        j                  j                  fd}t	        d|      5  t        d|      J t        d|      }|d|v sJ 	 ddd       y# 1 sw Y   yxY w)	zSimulate case-folding normcase (as on Windows) to verify that
        _validate_member applies it to both target and prefix paths.Rootc                 0     |       j                         S r"   )lower)poriginal_normcases    r   case_folding_normcasez^TestValidateMember.test_normcase_is_applied_to_path_comparisons.<locals>.case_folding_normcase  s    $Q'--//r   z nltk.downloader.os.path.normcaserw   r5   Nr   rC   )r   rQ   pathnormcaser   r   )r=   r2   r   r   r   r   s        @r   ,test_normcase_is_applied_to_path_comparisonsz?TestValidateMember.test_normcase_is_applied_to_path_comparisons  s     8f$%GG,,	0 .-
 	? $ND9AAA%mT:F%**>>>*>	? 	? 	?s   #A))A2N)r   r   r   r   r   r   r   r   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   [  s    K>;
;
<
 [[&C  ;	;
 [[LL##E**B  ;	;
	"?r   r   )F)rQ   r   r   pathlibr   unittest.mockr   r\   nltk.downloaderr   r   r   dictr   bytesr   boolr   r/   r1   r   r   r   r   <module>r      s|    	 
     G G* *S%Z(8 *T *Pd P$ P Pp+ p+f	C? C?r   