Help on class ALS in module pyspark.ml.recommendation:
class ALS(pyspark.ml.wrapper.JavaEstimator, _ALSParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
| Alternating Least Squares (ALS) matrix factorization.
|
| ALS attempts to estimate the ratings matrix `R` as the product of
| two lower-rank matrices, `X` and `Y`, i.e. `X * Yt = R`. Typically
| these approximations are called 'factor' matrices. The general
| approach is iterative. During each iteration, one of the factor
| matrices is held constant, while the other is solved for using least
| squares. The newly-solved factor matrix is then held constant while
| solving for the other factor matrix.
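|
| As a rough illustration of the alternating updates only (a dense NumPy
| sketch that treats every cell of `R` as observed, unlike the sparse,
| blocked implementation described below; all values are toy data)::
|
|     import numpy as np
|
|     R = np.array([[4.0, 2.0, 0.0],        # toy ratings matrix (users x items)
|                   [0.0, 3.0, 4.0],
|                   [0.0, 1.0, 5.0]])
|     rank, reg = 2, 0.1
|     rng = np.random.default_rng(0)
|     X = rng.normal(size=(R.shape[0], rank))   # user factors
|     Y = rng.normal(size=(R.shape[1], rank))   # item factors
|     for _ in range(10):
|         # hold Y fixed, solve regularized least squares for X
|         X = np.linalg.solve(Y.T @ Y + reg * np.eye(rank), Y.T @ R.T).T
|         # hold X fixed, solve regularized least squares for Y
|         Y = np.linalg.solve(X.T @ X + reg * np.eye(rank), X.T @ R).T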
|
| This is a blocked implementation of the ALS factorization algorithm
| that groups the two sets of factors (referred to as "users" and
| "products") into blocks and reduces communication by only sending
| one copy of each user vector to each product block on each
| iteration, and only for the product blocks that need that user's
| feature vector. This is achieved by pre-computing some information
| about the ratings matrix to determine the "out-links" of each user
| (which blocks of products it will contribute to) and "in-link"
| information for each product (which of the feature vectors it
| receives from each user block it will depend on). This allows us to
| send only an array of feature vectors between each user block and
| product block, and have the product block find the users' ratings
| and update the products based on these messages.
|
| For implicit preference data, the algorithm used is based on
| `"Collaborative Filtering for Implicit Feedback Datasets",
| <https://doi.org/10.1109/ICDM.2008.22>`_, adapted for the blocked
| approach used here.
|
| Essentially, instead of finding the low-rank approximations to the
| rating matrix `R`, this finds the approximations for a preference
| matrix `P` where the elements of `P` are 1 if r > 0 and 0 if r <= 0.
| The ratings then act as 'confidence' values related to the strength of
| indicated user preferences rather than explicit ratings given to
| items.
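|
| Implicit-feedback training is enabled through the :py:attr:`implicitPrefs`
| and :py:attr:`alpha` params; for example (the alpha value and the "clicks"
| rating column below are illustrative, not required values)::
|
|     als = ALS(rank=10, implicitPrefs=True, alpha=40.0,
|               userCol="user", itemCol="item", ratingCol="clicks")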
|
| .. versionadded:: 1.4.0
|
| Notes
| -----
| The input rating dataframe to the ALS implementation should be deterministic.
| Nondeterministic data can cause failures when fitting the ALS model.
| For example, an order-sensitive operation like sampling after a repartition makes
| the dataframe output nondeterministic, e.g. `df.repartition(2).sample(False, 0.5, 1618)`.
| Checkpointing the sampled dataframe or adding a sort before sampling can help make the
| dataframe deterministic.
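|
| A minimal sketch of the two workarounds mentioned above (the checkpoint
| directory and the sort keys are illustrative)::
|
|     spark.sparkContext.setCheckpointDir("/tmp/als_checkpoints")  # illustrative path
|     deterministic_df = df.repartition(2).sample(False, 0.5, 1618).checkpoint()
|     # or: sort on a unique key before sampling instead of checkpointing
|     deterministic_df = df.sort("user", "item").sample(False, 0.5, 1618)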
|
| Examples
| --------
| >>> df = spark.createDataFrame(
| ... [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],
| ... ["user", "item", "rating"])
| >>> als = ALS(rank=10, seed=0)
| >>> als.setMaxIter(5)
| ALS...
| >>> als.getMaxIter()
| 5
| >>> als.setRegParam(0.1)
| ALS...
| >>> als.getRegParam()
| 0.1
| >>> als.clear(als.regParam)
| >>> model = als.fit(df)
| >>> model.getBlockSize()
| 4096
| >>> model.getUserCol()
| 'user'
| >>> model.setUserCol("user")
| ALSModel...
| >>> model.getItemCol()
| 'item'
| >>> model.setPredictionCol("newPrediction")
| ALSModel...
| >>> model.rank
| 10
| >>> model.userFactors.orderBy("id").collect()
| [Row(id=0, features=[...]), Row(id=1, ...), Row(id=2, ...)]
| >>> test = spark.createDataFrame([(0, 2), (1, 0), (2, 0)], ["user", "item"])
| >>> predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])
| >>> predictions[0]
| Row(user=0, item=2, newPrediction=0.692910...)
| >>> predictions[1]
| Row(user=1, item=0, newPrediction=3.473569...)
| >>> predictions[2]
| Row(user=2, item=0, newPrediction=-0.899198...)
| >>> user_recs = model.recommendForAllUsers(3)
| >>> user_recs.where(user_recs.user == 0).select("recommendations.item", "recommendations.rating").collect()
| [Row(item=[0, 1, 2], rating=[3.910..., 1.997..., 0.692...])]
| >>> item_recs = model.recommendForAllItems(3)
| >>> item_recs.where(item_recs.item == 2).select("recommendations.user", "recommendations.rating").collect()
| [Row(user=[2, 1, 0], rating=[4.892..., 3.991..., 0.692...])]
| >>> user_subset = df.where(df.user == 2)
| >>> user_subset_recs = model.recommendForUserSubset(user_subset, 3)
| >>> user_subset_recs.select("recommendations.item", "recommendations.rating").first()
| Row(item=[2, 1, 0], rating=[4.892..., 1.076..., -0.899...])
| >>> item_subset = df.where(df.item == 0)
| >>> item_subset_recs = model.recommendForItemSubset(item_subset, 3)
| >>> item_subset_recs.select("recommendations.user", "recommendations.rating").first()
| Row(user=[0, 1, 2], rating=[3.910..., 3.473..., -0.899...])
| >>> als_path = temp_path + "/als"
| >>> als.save(als_path)
| >>> als2 = ALS.load(als_path)
| >>> als2.getMaxIter()
| 5
| >>> model_path = temp_path + "/als_model"
| >>> model.save(model_path)
| >>> model2 = ALSModel.load(model_path)
| >>> model.rank == model2.rank
| True
| >>> sorted(model.userFactors.collect()) == sorted(model2.userFactors.collect())
| True
| >>> sorted(model.itemFactors.collect()) == sorted(model2.itemFactors.collect())
| True
| >>> model.transform(test).take(1) == model2.transform(test).take(1)
| True
|
| Method resolution order:
| ALS
| pyspark.ml.wrapper.JavaEstimator
| pyspark.ml.wrapper.JavaParams
| pyspark.ml.wrapper.JavaWrapper
| pyspark.ml.base.Estimator
| _ALSParams
| _ALSModelParams
| pyspark.ml.param.shared.HasPredictionCol
| pyspark.ml.param.shared.HasBlockSize
| pyspark.ml.param.shared.HasMaxIter
| pyspark.ml.param.shared.HasRegParam
| pyspark.ml.param.shared.HasCheckpointInterval
| pyspark.ml.param.shared.HasSeed
| pyspark.ml.param.Params
| pyspark.ml.util.Identifiable
| pyspark.ml.util.JavaMLWritable
| pyspark.ml.util.MLWritable
| pyspark.ml.util.JavaMLReadable
| pyspark.ml.util.MLReadable
| builtins.object
|
| Methods defined here:
|
| __init__(self, *, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol='user', itemCol='item', seed=None, ratingCol='rating', nonnegative=False, checkpointInterval=10, intermediateStorageLevel='MEMORY_AND_DISK', finalStorageLevel='MEMORY_AND_DISK', coldStartStrategy='nan', blockSize=4096)
| __init__(self, \*, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10,
|     numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol="user",
|     itemCol="item", seed=None, ratingCol="rating", nonnegative=False,
|     checkpointInterval=10, intermediateStorageLevel="MEMORY_AND_DISK",
|     finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan",
|     blockSize=4096)
|
| setAlpha(self, value)
| Sets the value of :py:attr:`alpha`.
|
| .. versionadded:: 1.4.0
|
| setBlockSize(self, value)
| Sets the value of :py:attr:`blockSize`.
|
| .. versionadded:: 3.0.0
|
| setCheckpointInterval(self, value)
| Sets the value of :py:attr:`checkpointInterval`.
|
| setColdStartStrategy(self, value)
| Sets the value of :py:attr:`coldStartStrategy`.
|
| .. versionadded:: 2.2.0
|
| setFinalStorageLevel(self, value)
| Sets the value of :py:attr:`finalStorageLevel`.
|
| .. versionadded:: 2.0.0
|
| setImplicitPrefs(self, value)
| Sets the value of :py:attr:`implicitPrefs`.
|
| .. versionadded:: 1.4.0
|
| setIntermediateStorageLevel(self, value)
| Sets the value of :py:attr:`intermediateStorageLevel`.
|
| .. versionadded:: 2.0.0
|
| setItemCol(self, value)
| Sets the value of :py:attr:`itemCol`.
|
| .. versionadded:: 1.4.0
|
| setMaxIter(self, value)
| Sets the value of :py:attr:`maxIter`.
|
| setNonnegative(self, value)
| Sets the value of :py:attr:`nonnegative`.
|
| .. versionadded:: 1.4.0
|
| setNumBlocks(self, value)
| Sets both :py:attr:`numUserBlocks` and :py:attr:`numItemBlocks` to the specified value.
|
| .. versionadded:: 1.4.0
|
| setNumItemBlocks(self, value)
| Sets the value of :py:attr:`numItemBlocks`.
|
| .. versionadded:: 1.4.0
|
| setNumUserBlocks(self, value)
| Sets the value of :py:attr:`numUserBlocks`.
|
| .. versionadded:: 1.4.0
|
| setParams(self, *, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol='user', itemCol='item', seed=None, ratingCol='rating', nonnegative=False, checkpointInterval=10, intermediateStorageLevel='MEMORY_AND_DISK', finalStorageLevel='MEMORY_AND_DISK', coldStartStrategy='nan', blockSize=4096)
| setParams(self, \*, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10,
|     numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol="user",
|     itemCol="item", seed=None, ratingCol="rating", nonnegative=False,
|     checkpointInterval=10, intermediateStorageLevel="MEMORY_AND_DISK",
|     finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan",
|     blockSize=4096)
| Sets params for ALS.
|
| .. versionadded:: 1.4.0
|
| setPredictionCol(self, value)
| Sets the value of :py:attr:`predictionCol`.
|
| setRank(self, value)
| Sets the value of :py:attr:`rank`.
|
| .. versionadded:: 1.4.0
|
| setRatingCol(self, value)
| Sets the value of :py:attr:`ratingCol`.
|
| .. versionadded:: 1.4.0
|
| setRegParam(self, value)
| Sets the value of :py:attr:`regParam`.
|
| setSeed(self, value)
| Sets the value of :py:attr:`seed`.
|
| setUserCol(self, value)
| Sets the value of :py:attr:`userCol`.
|
| .. versionadded:: 1.4.0
|
| ----------------------------------------------------------------------
| Data and other attributes defined here:
|
| __abstractmethods__ = frozenset()
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.wrapper.JavaParams:
|
| clear(self, param)
| Clears a param from the param map if it has been explicitly set.
|
| copy(self, extra=None)
| Creates a copy of this instance with the same uid and some
| extra params. This implementation first calls Params.copy and
| then makes a copy of the companion Java pipeline component with
| extra params. So both the Python wrapper and the Java pipeline
| component get copied.
|
| Parameters
| ----------
| extra : dict, optional
| Extra parameters to copy to the new instance
|
| Returns
| -------
| :py:class:`JavaParams`
| Copy of this instance
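|
| For example (illustrative, using an `als` estimator as in the class-level
| Examples section)::
|
|     als_copy = als.copy({als.maxIter: 20})   # copies the Python wrapper and Java component
|     als_copy.getMaxIter()                    # 20; the original `als` is unchanged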
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.wrapper.JavaWrapper:
|
| __del__(self)
|
| ----------------------------------------------------------------------
| Data descriptors inherited from pyspark.ml.wrapper.JavaWrapper:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.base.Estimator:
|
| fit(self, dataset, params=None)
| Fits a model to the input dataset with optional parameters.
|
| .. versionadded:: 1.3.0
|
| Parameters
| ----------
| dataset : :py:class:`pyspark.sql.DataFrame`
| input dataset.
| params : dict or list or tuple, optional
| an optional param map that overrides embedded params. If a list/tuple of
| param maps is given, this calls fit on each param map and returns a list of
| models.
|
| Returns
| -------
| :py:class:`Transformer` or a list of :py:class:`Transformer`
| fitted model(s)
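|
| For example (illustrative, reusing the `als` estimator and `df` dataframe
| from the class-level Examples section)::
|
|     model = als.fit(df, params={als.regParam: 0.05})
|     # a list of param maps returns one fitted model per map
|     models = als.fit(df, params=[{als.rank: 5}, {als.rank: 20}])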
|
| fitMultiple(self, dataset, paramMaps)
| Fits a model to the input dataset for each param map in `paramMaps`.
|
| .. versionadded:: 2.3.0
|
| Parameters
| ----------
| dataset : :py:class:`pyspark.sql.DataFrame`
| input dataset.
| paramMaps : :py:class:`collections.abc.Sequence`
| A Sequence of param maps.
|
| Returns
| -------
| :py:class:`_FitMultipleIterator`
| A thread-safe iterable which contains one model for each param map. Each
| call to `next(modelIterator)` will return `(index, model)` where model was fit
| using `paramMaps[index]`. `index` values may not be sequential.
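|
| For example (illustrative; `als` and `df` as in the class-level Examples
| section)::
|
|     param_maps = [{als.rank: 5}, {als.rank: 20}]
|     models = [None] * len(param_maps)
|     for index, model in als.fitMultiple(df, param_maps):
|         models[index] = model   # index refers back into param_maps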
|
| ----------------------------------------------------------------------
| Methods inherited from _ALSParams:
|
| getAlpha(self)
| Gets the value of alpha or its default value.
|
| .. versionadded:: 1.4.0
|
| getFinalStorageLevel(self)
| Gets the value of finalStorageLevel or its default value.
|
| .. versionadded:: 2.0.0
|
| getImplicitPrefs(self)
| Gets the value of implicitPrefs or its default value.
|
| .. versionadded:: 1.4.0
|
| getIntermediateStorageLevel(self)
| Gets the value of intermediateStorageLevel or its default value.
|
| .. versionadded:: 2.0.0
|
| getNonnegative(self)
| Gets the value of nonnegative or its default value.
|
| .. versionadded:: 1.4.0
|
| getNumItemBlocks(self)
| Gets the value of numItemBlocks or its default value.
|
| .. versionadded:: 1.4.0
|
| getNumUserBlocks(self)
| Gets the value of numUserBlocks or its default value.
|
| .. versionadded:: 1.4.0
|
| getRank(self)
| Gets the value of rank or its default value.
|
| .. versionadded:: 1.4.0
|
| getRatingCol(self)
| Gets the value of ratingCol or its default value.
|
| .. versionadded:: 1.4.0
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from _ALSParams:
|
| alpha = Param(parent='undefined', name='alpha', doc='alpha for implici...
|
| finalStorageLevel = Param(parent='undefined', name='finalStorageLevel'...
|
| implicitPrefs = Param(parent='undefined', name='implicitPrefs', doc='w...
|
| intermediateStorageLevel = Param(parent='undefined', name='intermediat...
|
| nonnegative = Param(parent='undefined', name='nonnegative', do...to us...
|
| numItemBlocks = Param(parent='undefined', name='numItemBlocks', doc='n...
|
| numUserBlocks = Param(parent='undefined', name='numUserBlocks', doc='n...
|
| rank = Param(parent='undefined', name='rank', doc='rank of the factori...
|
| ratingCol = Param(parent='undefined', name='ratingCol', doc='column na...
|
| ----------------------------------------------------------------------
| Methods inherited from _ALSModelParams:
|
| getColdStartStrategy(self)
| Gets the value of coldStartStrategy or its default value.
|
| .. versionadded:: 2.2.0
|
| getItemCol(self)
| Gets the value of itemCol or its default value.
|
| .. versionadded:: 1.4.0
|
| getUserCol(self)
| Gets the value of userCol or its default value.
|
| .. versionadded:: 1.4.0
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from _ALSModelParams:
|
| coldStartStrategy = Param(parent='undefined', name='coldStartStrateg.....
|
| itemCol = Param(parent='undefined', name='itemCol', doc='c...ds. Ids m...
|
| userCol = Param(parent='undefined', name='userCol', doc='c...ds. Ids m...
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.param.shared.HasPredictionCol:
|
| getPredictionCol(self)
| Gets the value of predictionCol or its default value.
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from pyspark.ml.param.shared.HasPredictionCol:
|
| predictionCol = Param(parent='undefined', name='predictionCol', doc='p...
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.param.shared.HasBlockSize:
|
| getBlockSize(self)
| Gets the value of blockSize or its default value.
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from pyspark.ml.param.shared.HasBlockSize:
|
| blockSize = Param(parent='undefined', name='blockSize', doc=...n then ...
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.param.shared.HasMaxIter:
|
| getMaxIter(self)
| Gets the value of maxIter or its default value.
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from pyspark.ml.param.shared.HasMaxIter:
|
| maxIter = Param(parent='undefined', name='maxIter', doc='max number of...
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.param.shared.HasRegParam:
|
| getRegParam(self)
| Gets the value of regParam or its default value.
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from pyspark.ml.param.shared.HasRegParam:
|
| regParam = Param(parent='undefined', name='regParam', doc='regularizat...
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.param.shared.HasCheckpointInterval:
|
| getCheckpointInterval(self)
| Gets the value of checkpointInterval or its default value.
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from pyspark.ml.param.shared.HasCheckpointInterval:
|
| checkpointInterval = Param(parent='undefined', name='checkpointInterv....
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.param.shared.HasSeed:
|
| getSeed(self)
| Gets the value of seed or its default value.
|
| ----------------------------------------------------------------------
| Data and other attributes inherited from pyspark.ml.param.shared.HasSeed:
|
| seed = Param(parent='undefined', name='seed', doc='random seed.')
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.param.Params:
|
| explainParam(self, param)
| Explains a single param and returns its name, doc, and optional
| default value and user-supplied value in a string.
|
| explainParams(self)
| Returns the documentation of all params with their optional
| default values and user-supplied values.
|
| extractParamMap(self, extra=None)
| Extracts the embedded default param values and user-supplied
| values, and then merges them with extra values from input into
| a flat param map, where the latter value is used if there exist
| conflicts, i.e., with ordering: default param values <
| user-supplied values < extra.
|
| Parameters
| ----------
| extra : dict, optional
| extra param values
|
| Returns
| -------
| dict
| merged param map
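|
| For example (illustrative values)::
|
|     als = ALS(maxIter=3)                               # user-supplied value
|     param_map = als.extractParamMap({als.maxIter: 7})  # extra wins conflicts
|     param_map[als.maxIter]    # 7
|     param_map[als.regParam]   # 0.1 (the default)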
|
| getOrDefault(self, param)
| Gets the value of a param in the user-supplied param map or its
| default value. Raises an error if neither is set.
|
| getParam(self, paramName)
| Gets a param by its name.
|
| hasDefault(self, param)
| Checks whether a param has a default value.
|
| hasParam(self, paramName)
| Tests whether this instance contains a param with a given
| (string) name.
|
| isDefined(self, param)
| Checks whether a param is explicitly set by the user or has
| a default value.
|
| isSet(self, param)
| Checks whether a param is explicitly set by the user.
|
| set(self, param, value)
| Sets a parameter in the embedded param map.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from pyspark.ml.param.Params:
|
| params
| Returns all params ordered by name. The default implementation
| uses :py:func:`dir` to get all attributes of type
| :py:class:`Param`.
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.util.Identifiable:
|
| __repr__(self)
| Return repr(self).
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.util.JavaMLWritable:
|
| write(self)
| Returns an MLWriter instance for this ML instance.
|
| ----------------------------------------------------------------------
| Methods inherited from pyspark.ml.util.MLWritable:
|
| save(self, path)
| Save this ML instance to the given path, a shortcut of `write().save(path)`.
|
| ----------------------------------------------------------------------
| Class methods inherited from pyspark.ml.util.JavaMLReadable:
|
| read() from abc.ABCMeta
| Returns an MLReader instance for this class.
|
| ----------------------------------------------------------------------
| Class methods inherited from pyspark.ml.util.MLReadable:
|
| load(path) from abc.ABCMeta
| Reads an ML instance from the input path, a shortcut of `read().load(path)`.