diff --git a/examples/ncf/README.md b/examples/ncf/README.md
index eac9b0c5ed7963c996019a7899c9eb0d7c57dfbc..e8286a223714567fe2d01a38c151a69af2de927e 100644
--- a/examples/ncf/README.md
+++ b/examples/ncf/README.md
@@ -25,8 +25,6 @@ The model trains on binary information about whether or not a user interacted wi
 
 ## Setup
 
-### Steps to configure machine
-
 * Install `unzip` and `curl`
 
     ```bash
@@ -41,14 +39,21 @@ The model trains on binary information about whether or not a user interacted wi
     pip install -e .
     ```
 
-* Download and verify data
+* Obtain the ml-20m dataset
 
     ```bash
     cd <distiller-repo-root>/examples/ncf
+
     # Creates ml-20.zip
-    source ../download_dataset.sh
+    source download_dataset.sh
+
     # Confirms the MD5 checksum of ml-20.zip
-    source ../verify_dataset.sh
+    source verify_dataset.sh
+
+    # Extracts the dataset into a sub-directory named 'ml-20m'.
+    # During the last step the script might appear to hang.
+    # This is normal; it finishes after a few minutes.
+    source extract_dataset.sh
     ```
 
 ## Running the Sample
diff --git a/examples/ncf/extract_dataset.sh b/examples/ncf/extract_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7ab3a5658660205c872337d78ab85e685deba2c0
--- /dev/null
+++ b/examples/ncf/extract_dataset.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Unzips ml-20m.zip and, on success, runs convert.py on ml-20m/ratings.csv.
+# Run from examples/ncf, next to ml-20m.zip and convert.py:
+#     source extract_dataset.sh
+# This script is meant to be sourced, so it avoids 'set -e' and a bare 'exit',
+# either of which would affect the caller's interactive shell.
+
+echo "unzip ml-20m.zip"
+# -u: only extract files that are new or newer than what is already on disk.
+if unzip -u ml-20m.zip
+then
+    echo "Start processing ml-20m/ratings.csv"
+    python convert.py ml-20m/ratings.csv ml-20m --negatives 999
+else
+    echo "Problem unzipping ml-20m.zip" >&2
+    echo "Please run 'source download_dataset.sh && source verify_dataset.sh' first" >&2
+    # 'return' succeeds only when sourced; fall back to 'exit' when executed.
+    return 1 2>/dev/null || exit 1
+fi