From 51ade51a9fd64fc2fe651c505a286e6f29f59d40 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Wed, 13 Jul 2016 11:39:32 +0100
Subject: [PATCH] [SPARK-16440][MLLIB] Undeleted broadcast variables in
 Word2Vec causing OoM for long runs

## What changes were proposed in this pull request?

Unpersist the broadcast variables `expTable`, `bcVocab` and `bcVocabHash` in Word2Vec.fit for more timely and reliable resource cleanup

## How was this patch tested?

Jenkins tests

Author: Sean Owen <sowen@cloudera.com>

Closes #14153 from srowen/SPARK-16440.
---
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala   | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index f2211df3f9..6b9c8ee2e3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -434,6 +434,9 @@ class Word2Vec extends Serializable with Logging {
       bcSyn1Global.unpersist(false)
     }
     newSentences.unpersist()
+    expTable.unpersist()
+    bcVocab.unpersist()
+    bcVocabHash.unpersist()
 
     val wordArray = vocab.map(_.word)
     new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global)
-- 
GitLab