
    \ji!                     \    d Z ddlmZmZmZ dZ G d d      Z G d d      Z G d d	      Zy
)z
Unit tests for nltk.tokenize.texttiling.

Marti A. Hearst (1997) "TextTiling: Segmenting Text into Multi-Paragraph
Subtopic Passages", Computational Linguistics, 23(1), pp. 33-64.
https://aclanthology.org/J97-1003.pdf
    )BLOCK_COMPARISONVOCABULARY_INTRODUCTIONTextTilingTokenizera  
The quick brown fox jumped over the lazy dog. The dog was sleeping
in the sun. It was a beautiful day outside in the forest. The fox
continued on its way through the dense woodland. Birds were singing
in the trees and rabbits scurried through the underbrush. The animals
were enjoying the warm afternoon sunshine.

Meanwhile, in the city, the stock market was experiencing significant
changes. Traders were anxious about the new economic policies. The
financial sector was particularly affected by the recent changes in
interest rates. Banks reported lower quarterly earnings. Investors
were pulling their money out of risky assets. The central bank was
considering raising rates again.

In other news, scientists discovered a new species of deep-sea fish.
The fish was found at a depth of three thousand meters below the
surface. Marine biologists were excited about the discovery. The
specimen was brought to the laboratory for further study. Researchers
believe the species has been living in the deep ocean for millions
of years.

The weather forecast predicted rain for the weekend across the region.
Farmers were hoping for the precipitation to help their growing crops.
The drought had lasted several months with no relief in sight. Water
reservoirs were at critically low levels. Irrigation systems were
being strained beyond their capacity.

The quick brown fox jumped over the lazy dog. The dog was sleeping
in the sun. It was a beautiful day outside in the forest. The fox
continued on its way through the dense woodland. Birds were singing
in the trees and rabbits scurried through the underbrush. The animals
were enjoying the warm afternoon sunshine.

Meanwhile, in the city, the stock market was experiencing significant
changes. Traders were anxious about the new economic policies. The
financial sector was particularly affected by the recent changes in
interest rates. Banks reported lower quarterly earnings. Investors
were pulling their money out of risky assets. The central bank was
considering raising rates again.

In other news, scientists discovered a new species of deep-sea fish.
The fish was found at a depth of three thousand meters below the
surface. Marine biologists were excited about the discovery. The
specimen was brought to the laboratory for further study. Researchers
believe the species has been living in the deep ocean for millions
of years.

The weather forecast predicted rain for the weekend across the region.
Farmers were hoping for the precipitation to help their growing crops.
The drought had lasted several months with no relief in sight. Water
reservoirs were at critically low levels. Irrigation systems were
being strained beyond their capacity.

The quick brown fox jumped over the lazy dog. The dog was sleeping
in the sun. It was a beautiful day outside in the forest. The fox
continued on its way through the dense woodland. Birds were singing
in the trees and rabbits scurried through the underbrush. The animals
were enjoying the warm afternoon sunshine.

Meanwhile, in the city, the stock market was experiencing significant
changes. Traders were anxious about the new economic policies. The
financial sector was particularly affected by the recent changes in
interest rates. Banks reported lower quarterly earnings. Investors
were pulling their money out of risky assets. The central bank was
considering raising rates again.

In other news, scientists discovered a new species of deep-sea fish.
The fish was found at a depth of three thousand meters below the
surface. Marine biologists were excited about the discovery. The
specimen was brought to the laboratory for further study. Researchers
believe the species has been living in the deep ocean for millions
of years.

The weather forecast predicted rain for the weekend across the region.
Farmers were hoping for the precipitation to help their growing crops.
The drought had lasted several months with no relief in sight. Water
reservoirs were at critically low levels. Irrigation systems were
being strained beyond their capacity.
c                   .    e Zd ZdZd Zd Zd Zd Zd Zy)TestTextTilingBlockComparisonzATests for the block comparison method (Hearst 1997, Section 3.2).c                     t        ddt              }|j                  t              }t	        |t
              sJ t        |      dk\  sJ y)z7Block comparison should return a list of text segments.
      wksimilarity_method   N)r   r   tokenizeMULTI_TOPIC_TEXT
isinstancelistlenselfttsegmentss      H/root/env/lib/python3.12/site-packages/nltk/test/unit/test_texttiling.pytest_returns_segmentsz3TestTextTilingBlockComparison.test_returns_segments7   sB     2>NO;;/0(D)))8}!!!    c                     t        ddt              }|j                  t              }dj	                  |      t        k(  sJ yz;Concatenated segments should reconstruct the original text.r	   r
   r    N)r   r   r   r   joinr   s      r   test_segments_cover_full_textz;TestTextTilingBlockComparison.test_segments_cover_full_text>   s8     2>NO;;/0wwx $4444r   c                    t        ddt        d      }|j                  t              }t	        |      dk(  sJ |\  }}}}t	        |      dkD  sJ t	        |      dkD  sJ t	        |      dkD  sJ t	        |      dkD  sJ yzDIn demo mode, should return (gap_scores, smooth, depth, boundaries).r	   r
   Tr   r   r   	demo_mode   r   N)r   r   r   r   r   r   r   result
gap_scoressmooth_scoresdepth_scores
boundariess          r   test_demo_mode_returns_scoresz;TestTextTilingBlockComparison.test_demo_mode_returns_scoresD   s     A)9T
 -.6{a>D;
M<:"""=!A%%%< 1$$$:"""r   c                     t        ddt        d      }|j                  t              \  }}}}|D ]  }d|cxk  rdk  rn J d| d        y	)
zDBlock comparison gap scores should be in [0, 1] (cosine similarity).r	   r
   Tr#           g      ?Score z out of [0, 1] rangeN)r   r   r   r   r   r   r(   _scores        r   $test_gap_scores_between_zero_and_onezBTestTextTilingBlockComparison.test_gap_scores_between_zero_and_oneQ   sc     A)9T
 !kk*:;
Aq! 	ME%&3&L&7K(LL&	Mr   c                     d}dj                  |gdz        dz   }t        ddt              }|j                  |      }t	        |      dk  sJ y)z;Repeating the same paragraph should produce few boundaries.1The cat sat on the mat. The dog chased the ball. 

r	   r
   r   N)r   r   r   r   r   )r   	paragraphrepeatedr   r   s        r   $test_homogeneous_text_few_boundarieszBTestTextTilingBlockComparison.test_homogeneous_text_few_boundariesZ   sP    G	KKb 01V; 2>NO;;x(8}!!!r   N)	__name__
__module____qualname____doc__r   r    r,   r3   r9    r   r   r   r   4   s    K"5#M"r   r   c                   .    e Zd ZdZd Zd Zd Zd Zd Zy)TestTextTilingVocabIntroductionzHTests for the vocabulary introduction method (Hearst 1997, Section 3.2).c                     t        ddt              }|j                  t              }t	        |t
              sJ t        |      dk\  sJ y)z>Vocabulary introduction should return a list of text segments.r	   r
   r   r   N)r   r   r   r   r   r   r   r   s      r   r   z5TestTextTilingVocabIntroduction.test_returns_segmentsh   sB     2>UV;;/0(D)))8}!!!r   c                     t        ddt              }|j                  t              }dj	                  |      t        k(  sJ yr   )r   r   r   r   r   r   s      r   r    z=TestTextTilingVocabIntroduction.test_segments_cover_full_texto   s8     2>UV;;/0wwx $4444r   c                    t        ddt        d      }|j                  t              }t	        |      dk(  sJ |\  }}}}t	        |      dkD  sJ t	        |      dkD  sJ t	        |      dkD  sJ t	        |      dkD  sJ yr"   )r   r   r   r   r   r&   s          r   r,   z=TestTextTilingVocabIntroduction.test_demo_mode_returns_scoresu   s     A)@D
 -.6{a>D;
M<:"""=!A%%%< 1$$$:"""r   c                     t        ddt        d      }|j                  t              \  }}}}|D ]  }|dk\  r	J d| d        y)	z6Vocabulary introduction scores should be non-negative.r	   r
   Tr#   r.   r/   z is negativeN)r   r   r   r   r0   s        r   test_gap_scores_non_negativez<TestTextTilingVocabIntroduction.test_gap_scores_non_negative   sY     A)@D
 !kk*:;
Aq! 	>EC<=6%!==<	>r   c                    d}dj                  |gdz        dz   }t        ddt        d      }|j                  |      \  }}}}|t	        |      dz  d }|rt        |      t	        |      z  nd	}|d
k  sJ d| d       y)zCRepeating the same paragraph should produce low vocab intro scores.r5   r6   r	   r
   Tr#      Nr   g333333?zAverage later score z too high for repeated text)r   r   r   r   r   sum)r   r7   r8   r   r(   r1   later_scores	avg_laters           r    test_homogeneous_text_low_scoresz@TestTextTilingVocabIntroduction.test_homogeneous_text_low_scores   s    G	KKb 01V; A)@D
 !kk(3
Aq! "#j/Q"6"89=IC%L(99q		I!),GH	Ir   N)	r:   r;   r<   r=   r   r    r,   rE   rK   r>   r   r   r@   r@   e   s    R"5#>Ir   r@   c                   (    e Zd ZdZd Zd Zd Zd Zy)TestTextTilingCommonzTests common to both methods.c                     t         t        fD ]>  }t        dd|      }|j                  t              }t        |      dkD  r5J d| d        y)zCBoth methods should find at least one boundary in multi-topic text.r	   r
   r   r   zMethod z( found no boundaries in multi-topic textN)r   r   r   r   r   r   )r   methodr   r   s       r   !test_both_methods_find_boundariesz6TestTextTilingCommon.test_both_methods_find_boundaries   s^    ')@A 	JF$rQ&IB{{#34HH!J HIJ!		Jr   c                 r    t        d      }	 |j                  t               J d       # t        $ r Y yw xY w)z2Invalid similarity method should raise ValueError.invalid)r   Expected ValueErrorNr   r   r   
ValueErrorr   r   s     r   test_invalid_similarity_methodz3TestTextTilingCommon.test_invalid_similarity_method   s;     9=	KK()///5 		   * 	66c                 r    t        d      }	 |j                  t               J d       # t        $ r Y yw xY w)z1Invalid smoothing method should raise ValueError.rR   )smoothing_methodrS   NrT   rV   s     r   test_invalid_smoothing_methodz2TestTextTilingCommon.test_invalid_smoothing_method   s;     )<	KK()///5 		rX   c                     t         t        fD ]l  }t        dd|      }|j                  t              }t        dd|d      }|j                  t              \  }}}}t        |      }|t        |      dz
  k\  rlJ  y)z:Number of boundaries should be number of segments minus 1.r	   r
   r   Tr#   rG   N)r   r   r   r   r   rH   r   )r   rO   r   r   tt_demor1   r+   num_boundariess           r   $test_boundary_count_matches_segmentsz9TestTextTilingCommon.test_boundary_count_matches_segments   s    ')@A 		7F$rQ&IB{{#34H)VtG #*"2"23C"DAq!Z _N!S]Q%6666		7r   N)r:   r;   r<   r=   rP   rW   r[   r_   r>   r   r   rM   rM      s    'J7r   rM   N)	r=   nltk.tokenize.texttilingr   r   r   r   r   r@   rM   r>   r   r   <module>ra      sA     @." ."b4I 4In)7 )7r   