I have tried using RecursiveURLLoader to crawl a website for its content , however it still return documents in that subdomain. I have asked langchat and also checked out the documentation but I still have no idea on why. Thanks for your help!
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_transformers import Html2TextTransformer
import re
from bs4 import BeautifulSoup
domain = "https://www.cuhkmc.hk/"
loader = RecursiveUrlLoader(
home_domain, # root domain URL
max_depth=3, # depth of recursion
base_url=home_domain, # restrict crawling to this base URL and subdomains if configured
encoding= "utf-8", # encoding for the content
extractor=bs4_extractor, # custom extractor function
exclude_dirs=("/assets/", "/static/", "/assets/shared/"), # exclude URLs containing these paths
)
docs = loader.load()
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
docx_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs_transformed = docx_splitter.split_documents(docs_transformed)
print(docs_transformed)
print(f"Total docs loaded: {len(docs_transformed)}")
# One of the results:
# Document(metadata={'source': 'https://www.cuhkmc.hk/assets/shared/fonts/lato-regular.woff2', 'content_type': 'application/octet-stream'}, page_content='wOF2 w *� v� � ?FFTM&���X` �J �e ��$��s6$�$�T\n�z�Z�M?webf[q�kw9�V5��o����n��I��ZW�ݎ\n�[�������2�6e&������L�ѲZ6cv��N�5�/�f�i���b%$:��:�]���p���=�G\u07b2M�uI�۶#{�����;]ѥ��\\ң%�����^�p�k��Q��lVԄ�c2?��b\n��eAM����AN��Л����#^����Y�XF�E���%�?�2m`A��f=���Β�=~x�xK(�v)f�Q��y���|�=3��a�$�"&O;�r2B�T�}�]�9s�U�#ED�+�ċx�L�9̔��\n�͞���b *X\n�X�H%R����@,0rV-�.�n.Z��k7�nw�j�oW\x7fy�v���U�j\\m�}��2fLz$����W��q!=b���Nr�ëu�SGM�+���v�c���B���_�_�\n�a�> u��J2Hk{F^/�y�F��T�Su�mS4N��� ���e��䈌�3�2n���}���L�2ֵ/5��}M�@�`I�\n/�&pV;�YR��U�zj�&�����# I\x8cS�{aP("%������w�V�����[L ��0 �Ȳ�h�Y��ц��;�7� c�sO�\nPDd�<�/�N:�;d�1D�����|2��\x7f A�0�&�@�IH�?��gE���JW�U�\n�������o�e(9�_k�{1Y��N��T���2%\x7f���l��I��&)�\ue7a77^��Ti�ML�������;\n#j<�l������8N�p��[�d���ͩS�����9��}�K���u�4�|�Z����pI,;�b�l��z��m{;���B\nɠI!HJf�l��m�R� $e�8#�ڬ=�WSr�t��\n4�����غ���ަ��+�*�Ud��(\'7��ۢ�Ъ�����|�P(����W�%�=\nA�""�������;���;)=o?%��ꨪ���#""F�����������%O �]���m�Ê��^3��\u0378�<�������^�"u\n(R��ݏK�r�f�k����\'���NJpi�\\�"�[k�1�x��uEu! ���K��7 ��W=3���'),