recommenders-team / recommenders

Best Practices on Recommendation Systems
https://recommenders-team.github.io/recommenders/intro.html
MIT License
18.81k stars 3.07k forks source link

Add new URL of MIND small and MIND large #2145

Closed miguelgfierro closed 3 weeks ago

miguelgfierro commented 1 month ago

Description

The MIND small dataset has been uploaded; the MIND large dataset has still not been found.

This will also fix the tests.

Related Issues

#2133

References

Checklist:

miguelgfierro commented 4 weeks ago
tests/data_validation/recommenders/datasets/test_mind.py ...FFFFFF.F.FFF                                                                                                                      [100%]

============================================================================================= FAILURES ==============================================================================================
________________________________________ test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip-52952752-0x8D834F2EB31BDEC] ________________________________________

url = 'https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip', content_length = '52952752', etag = '0x8D834F2EB31BDEC'

    @pytest.mark.parametrize(
        "url, content_length, etag",
        [
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip",
                "17372879",
                '"0x8D8B8AD5B233930"',
            ),  # NOTE: the z20 blob returns the etag with ""
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip",
                "10080022",
                '"0x8D8B8AD5B188839"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip",
                "97292694",
                '"0x8D8B8AD5B126C3B"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip",
                "52952752",
                "0x8D834F2EB31BDEC",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip",
                "30945572",
                "0x8D834F2EBA8D865",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip",
                "155178106",
                "0x8D87F67F4AEB960",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip",
                "530196631",
                "0x8D8244E90C15C07",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip",
                "103456245",
                "0x8D8244E92005849",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip",
                "150359301",
                "0x8D87F67E6CA4364",
            ),
        ],
    )
    def test_mind_url(url, content_length, etag):
        url_headers = requests.head(url).headers
>       assert url_headers["Content-Length"] == content_length
E       AssertionError: assert '52953372' == '52952752'
E         
E         - 52952752
E         + 52953372

tests/data_validation/recommenders/datasets/test_mind.py:63: AssertionError
_________________________________________ test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip-30945572-0x8D834F2EBA8D865] _________________________________________

url = 'https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip', content_length = '30945572', etag = '0x8D834F2EBA8D865'

    @pytest.mark.parametrize(
        "url, content_length, etag",
        [
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip",
                "17372879",
                '"0x8D8B8AD5B233930"',
            ),  # NOTE: the z20 blob returns the etag with ""
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip",
                "10080022",
                '"0x8D8B8AD5B188839"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip",
                "97292694",
                '"0x8D8B8AD5B126C3B"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip",
                "52952752",
                "0x8D834F2EB31BDEC",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip",
                "30945572",
                "0x8D834F2EBA8D865",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip",
                "155178106",
                "0x8D87F67F4AEB960",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip",
                "530196631",
                "0x8D8244E90C15C07",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip",
                "103456245",
                "0x8D8244E92005849",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip",
                "150359301",
                "0x8D87F67E6CA4364",
            ),
        ],
    )
    def test_mind_url(url, content_length, etag):
        url_headers = requests.head(url).headers
>       assert url_headers["Content-Length"] == content_length
E       AssertionError: assert '30946172' == '30945572'
E         
E         - 30945572
E         ?     ^^
E         + 30946172
E         ?     ^^

tests/data_validation/recommenders/datasets/test_mind.py:63: AssertionError
_______________________________________ test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip-155178106-0x8D87F67F4AEB960] ________________________________________

url = 'https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip', content_length = '155178106', etag = '0x8D87F67F4AEB960'

    @pytest.mark.parametrize(
        "url, content_length, etag",
        [
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip",
                "17372879",
                '"0x8D8B8AD5B233930"',
            ),  # NOTE: the z20 blob returns the etag with ""
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip",
                "10080022",
                '"0x8D8B8AD5B188839"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip",
                "97292694",
                '"0x8D8B8AD5B126C3B"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip",
                "52952752",
                "0x8D834F2EB31BDEC",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip",
                "30945572",
                "0x8D834F2EBA8D865",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip",
                "155178106",
                "0x8D87F67F4AEB960",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip",
                "530196631",
                "0x8D8244E90C15C07",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip",
                "103456245",
                "0x8D8244E92005849",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip",
                "150359301",
                "0x8D87F67E6CA4364",
            ),
        ],
    )
    def test_mind_url(url, content_length, etag):
        url_headers = requests.head(url).headers
        assert url_headers["Content-Length"] == content_length
>       assert url_headers["ETag"] == etag
E       assert '"0x8D8B8AD5B3677C6"' == '0x8D87F67F4AEB960'
E         
E         - 0x8D87F67F4AEB960
E         + "0x8D8B8AD5B3677C6"

tests/data_validation/recommenders/datasets/test_mind.py:64: AssertionError
_______________________________________ test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip-530196631-0x8D8244E90C15C07] ________________________________________

url = 'https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip', content_length = '530196631', etag = '0x8D8244E90C15C07'

    @pytest.mark.parametrize(
        "url, content_length, etag",
        [
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip",
                "17372879",
                '"0x8D8B8AD5B233930"',
            ),  # NOTE: the z20 blob returns the etag with ""
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip",
                "10080022",
                '"0x8D8B8AD5B188839"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip",
                "97292694",
                '"0x8D8B8AD5B126C3B"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip",
                "52952752",
                "0x8D834F2EB31BDEC",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip",
                "30945572",
                "0x8D834F2EBA8D865",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip",
                "155178106",
                "0x8D87F67F4AEB960",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip",
                "530196631",
                "0x8D8244E90C15C07",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip",
                "103456245",
                "0x8D8244E92005849",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip",
                "150359301",
                "0x8D87F67E6CA4364",
            ),
        ],
    )
    def test_mind_url(url, content_length, etag):
        url_headers = requests.head(url).headers
>       assert url_headers["Content-Length"] == content_length
E       AssertionError: assert '531361237' == '530196631'
E         
E         - 530196631
E         + 531361237

tests/data_validation/recommenders/datasets/test_mind.py:63: AssertionError
________________________________________ test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip-103456245-0x8D8244E92005849] _________________________________________

url = 'https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip', content_length = '103456245', etag = '0x8D8244E92005849'

    @pytest.mark.parametrize(
        "url, content_length, etag",
        [
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip",
                "17372879",
                '"0x8D8B8AD5B233930"',
            ),  # NOTE: the z20 blob returns the etag with ""
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip",
                "10080022",
                '"0x8D8B8AD5B188839"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip",
                "97292694",
                '"0x8D8B8AD5B126C3B"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip",
                "52952752",
                "0x8D834F2EB31BDEC",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip",
                "30945572",
                "0x8D834F2EBA8D865",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip",
                "155178106",
                "0x8D87F67F4AEB960",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip",
                "530196631",
                "0x8D8244E90C15C07",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip",
                "103456245",
                "0x8D8244E92005849",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip",
                "150359301",
                "0x8D87F67E6CA4364",
            ),
        ],
    )
    def test_mind_url(url, content_length, etag):
        url_headers = requests.head(url).headers
>       assert url_headers["Content-Length"] == content_length
E       AssertionError: assert '103593383' == '103456245'
E         
E         - 103456245
E         + 103593383

tests/data_validation/recommenders/datasets/test_mind.py:63: AssertionError
_______________________________________ test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip-150359301-0x8D87F67E6CA4364] ________________________________________

url = 'https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip', content_length = '150359301', etag = '0x8D87F67E6CA4364'

    @pytest.mark.parametrize(
        "url, content_length, etag",
        [
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip",
                "17372879",
                '"0x8D8B8AD5B233930"',
            ),  # NOTE: the z20 blob returns the etag with ""
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip",
                "10080022",
                '"0x8D8B8AD5B188839"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip",
                "97292694",
                '"0x8D8B8AD5B126C3B"',
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip",
                "52952752",
                "0x8D834F2EB31BDEC",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip",
                "30945572",
                "0x8D834F2EBA8D865",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip",
                "155178106",
                "0x8D87F67F4AEB960",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip",
                "530196631",
                "0x8D8244E90C15C07",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip",
                "103456245",
                "0x8D8244E92005849",
            ),
            (
                "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip",
                "150359301",
                "0x8D87F67E6CA4364",
            ),
        ],
    )
    def test_mind_url(url, content_length, etag):
        url_headers = requests.head(url).headers
        assert url_headers["Content-Length"] == content_length
>       assert url_headers["ETag"] == etag
E       assert '"0x8D8B8AD5B2ED4C9"' == '0x8D87F67E6CA4364'
E         
E         - 0x8D87F67E6CA4364
E         + "0x8D8B8AD5B2ED4C9"

tests/data_validation/recommenders/datasets/test_mind.py:64: AssertionError
_____________________________________________________________________________________ test_download_mind_small ______________________________________________________________________________________

tmp = '/tmp/pytest-of-u/pytest-23/tmp77ep29qd'

    def test_download_mind_small(tmp):
        train_path, valid_path = download_mind(size="small", dest_path=tmp)
        statinfo = os.stat(train_path)
>       assert statinfo.st_size == 52952752
E       assert 52953372 == 52952752
E        +  where 52953372 = os.stat_result(st_mode=33188, st_ino=225215, st_dev=2080, st_nlink=1, st_uid=1000, st_gid=1000, st_size=52953372, st_atime=1723823550, st_mtime=1723823618, st_ctime=1723823618).st_size

tests/data_validation/recommenders/datasets/test_mind.py:78: AssertionError
--------------------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------------------
100%|██████████| 51.7k/51.7k [01:08<00:00, 760KB/s]  
100%|██████████| 30.2k/30.2k [00:33<00:00, 892KB/s]  
______________________________________________________________________________________ test_extract_mind_small ______________________________________________________________________________________

tmp = '/tmp/pytest-of-u/pytest-23/tmptnfvzyfb'

    def test_extract_mind_small(tmp):
        train_zip, valid_zip = download_mind(size="small", dest_path=tmp)
        train_path, valid_path = extract_mind(train_zip, valid_zip, clean_zip_file=False)

>       statinfo = os.stat(os.path.join(train_path, "behaviors.tsv"))
E       FileNotFoundError: [Errno 2] No such file or directory: 'MINDsmall_train.zip/train/behaviors.tsv'

tests/data_validation/recommenders/datasets/test_mind.py:109: FileNotFoundError
--------------------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------------------
100%|██████████| 51.7k/51.7k [01:09<00:00, 749KB/s]  
100%|██████████| 30.2k/30.2k [00:30<00:00, 977KB/s]  
_____________________________________________________________________________________ test_download_mind_large ______________________________________________________________________________________

tmp_path = PosixPath('/tmp/pytest-of-u/pytest-23/test_download_mind_large0')

    def test_download_mind_large(tmp_path):
        train_path, valid_path = download_mind(size="large", dest_path=tmp_path)
        statinfo = os.stat(train_path)
>       assert statinfo.st_size == 530196631
E       assert 531361237 == 530196631
E        +  where 531361237 = os.stat_result(st_mode=33188, st_ino=225216, st_dev=2080, st_nlink=1, st_uid=1000, st_gid=1000, st_size=531361237, st_atime=1723823807, st_mtime=1723824456, st_ctime=1723824456).st_size

tests/data_validation/recommenders/datasets/test_mind.py:130: AssertionError
--------------------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------------------
100%|██████████| 519k/519k [10:49<00:00, 799KB/s]   
100%|██████████| 101k/101k [02:11<00:00, 769KB/s]   
______________________________________________________________________________________ test_extract_mind_large ______________________________________________________________________________________

tmp = '/tmp/pytest-of-u/pytest-23/tmppfvp4z4c'

    def test_extract_mind_large(tmp):
        train_zip, valid_zip = download_mind(size="large", dest_path=tmp)
        train_path, valid_path = extract_mind(train_zip, valid_zip)

>       statinfo = os.stat(os.path.join(train_path, "behaviors.tsv"))
E       FileNotFoundError: [Errno 2] No such file or directory: 'MINDlarge_train.zip/train/behaviors.tsv'

tests/data_validation/recommenders/datasets/test_mind.py:139: FileNotFoundError
--------------------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------------------
100%|██████████| 519k/519k [12:01<00:00, 719KB/s]   
100%|██████████| 101k/101k [02:08<00:00, 786KB/s]   
====================================================================================== short test summary info ======================================================================================
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip-52952752-0x8D834F2EB31BDEC] - AssertionError: assert '52953372' == '52952752'
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip-30945572-0x8D834F2EBA8D865] - AssertionError: assert '30946172' == '30945572'
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip-155178106-0x8D87F67F4AEB960] - assert '"0x8D8B8AD5B3677C6"' == '0x8D87F67F4AEB960'
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip-530196631-0x8D8244E90C15C07] - AssertionError: assert '531361237' == '530196631'
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip-103456245-0x8D8244E92005849] - AssertionError: assert '103593383' == '103456245'
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_mind_url[https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip-150359301-0x8D87F67E6CA4364] - assert '"0x8D8B8AD5B2ED4C9"' == '0x8D87F67E6CA4364'
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_download_mind_small - assert 52953372 == 52952752
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_extract_mind_small - FileNotFoundError: [Errno 2] No such file or directory: 'MINDsmall_train.zip/train/behaviors.tsv'
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_download_mind_large - assert 531361237 == 530196631
FAILED tests/data_validation/recommenders/datasets/test_mind.py::test_extract_mind_large - FileNotFoundError: [Errno 2] No such file or directory: 'MINDlarge_train.zip/train/behaviors.tsv'
============================================================================= 10 failed, 5 passed in 2020.59s (0:33:40) ======================================================================