diff --git a/build.gradle b/build.gradle index 6b97dd0291..a4a30bf5d0 100644 --- a/build.gradle +++ b/build.gradle @@ -53,8 +53,8 @@ configure(allProjs) { ext { scalaVersion = '2.11' scalaVersionRevision = '8' - scalaTestVersion = '3.0.0' - scalaCheckVersion = '1.13.5' + scalaTestVersion = '3.0.5' + scalaCheckVersion = '1.14.0' junitVersion = '4.11' avroVersion = '1.7.7' sparkVersion = '2.2.1' @@ -68,7 +68,7 @@ configure(allProjs) { jodaConvertVersion = '1.8.1' algebirdVersion = '0.12.3' jacksonVersion = '2.7.3' - luceneVersion = '7.1.0' + luceneVersion = '7.3.0' enumeratumVersion = '1.4.12' scoptVersion = '3.5.0' googleLibPhoneNumberVersion = '8.8.5' @@ -179,7 +179,8 @@ configure(allProjs) { header = rootProject.file('LICENSE.txt') ignoreFailures = true include '**/*.java', '**/*.scala' - exclude '**/com/salesforce/op/utils/io/DirectMapreduceOutputCommitter.scala', + exclude '**/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala', + '**/com/salesforce/op/utils/io/DirectMapreduceOutputCommitter.scala', '**/com/salesforce/op/test/TestSparkContext.scala', '**/com/salesforce/op/test/TempDirectoryTest.scala', '**/com/salesforce/op/utils/io/DirectOutputCommitter.scala', diff --git a/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala b/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala index a1a2306dfb..7ea30826c8 100644 --- a/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala +++ b/cli/src/main/scala/com/salesforce/op/cli/gen/templates/BinaryFeatureTemplate.scala @@ -33,13 +33,14 @@ package com.salesforce.op.cli.gen.templates import com.salesforce.op.features.{FeatureBuilder => FB} import com.salesforce.op.features.types._ + /** * This is a template for generating binary feature handling in a generated project */ class BinaryFeatureTemplate { private[templates] def feature = // BEGIN - FB.Binary[SampleObject] - .extract(o => Option(o.codeGeneration_binaryField_codeGeneration).map(_.booleanValue).toBinary) + FB.Binary[SampleObject] + .extract(o => Option(o.codeGeneration_binaryField_codeGeneration).map(_.booleanValue).toBinary) // END } diff --git a/core/build.gradle b/core/build.gradle index b931f91b28..5290a7e4d0 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -1,5 +1,6 @@ dependencies { compile project(':readers') + testRuntime project(':models') testCompile project(':testkit') // Google libphonenumber @@ -16,6 +17,7 @@ dependencies { // Lucene text analysis compile "org.apache.lucene:lucene-analyzers-common:$luceneVersion" compile "org.apache.lucene:lucene-analyzers-kuromoji:$luceneVersion" + compile "org.apache.lucene:lucene-analyzers-opennlp:$luceneVersion" compile "org.apache.lucene:lucene-suggest:$luceneVersion" // Scopt diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala index c112981524..c6a01d72c0 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala @@ -31,14 +31,18 @@ package com.salesforce.op -import com.salesforce.op.features.{Feature, OPFeature} +import com.salesforce.op.features.OPFeature import com.salesforce.op.filters.RawFeatureFilter import com.salesforce.op.readers.Reader import com.salesforce.op.stages.OPStage +import com.salesforce.op.stages.impl.preparators.CorrelationType +import com.salesforce.op.stages.impl.selector.ModelSelectorBase +import com.salesforce.op.utils.spark.RichDataset._ import 
com.salesforce.op.utils.reflection.ReflectionUtils import com.salesforce.op.utils.stages.FitStagesUtil +import com.salesforce.op.utils.stages.FitStagesUtil.{CutDAG, FittedDAG, Layer, StagesDAG} import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.Transformer +import org.apache.spark.ml.{Estimator, Transformer} import org.apache.spark.sql.{DataFrame, SparkSession} import scala.collection.mutable.{MutableList => MList} @@ -163,7 +167,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { Try { stage.set(stage.getParam(k), v) } orElse { - Try { ReflectionUtils.reflectSetterMethod(stage, k).get.apply(v) } + Try { ReflectionUtils.reflectSetterMethod(stage, k, Seq(v)) } } if (setStage.isFailure) log.error( s"Setting parameter $k with value $v for stage $stage with params ${stage.params.toList} failed with an error", @@ -180,7 +184,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { */ private def setStagesDAG(features: Array[OPFeature]): OpWorkflow.this.type = { // Unique stages layered by distance - val uniqueStagesLayered = DAG.compute(features) + val uniqueStagesLayered = FitStagesUtil.computeDAG(features) if (log.isDebugEnabled) { val total = uniqueStagesLayered.map(_.length).sum @@ -311,11 +315,18 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { */ def train(persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages) (implicit spark: SparkSession): OpWorkflowModel = { - val rawData = generateRawData() - // Update features with fitted stages - val fittedStages = fitStages(data = rawData, stagesToFit = stages, persistEveryKStages) - val newResultFeatures = resultFeatures.map(_.copyWithNewStages(fittedStages)) + val (fittedStages, newResultFeatures) = + if (stages.exists(_.isInstanceOf[Estimator[_]])) { + val rawData = generateRawData() + + // Update features with fitted stages + val fittedStgs = fitStages(data = rawData, stagesToFit = stages, persistEveryKStages) + val newResultFtrs = resultFeatures.map(_.copyWithNewStages(fittedStgs)) + fittedStgs -> newResultFtrs + } else { + stages -> resultFeatures + } val model = new OpWorkflowModel(uid, getParameters()) @@ -327,6 +338,93 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { reader.map(model.setReader).getOrElse(model) } + /** + * Fit the estimators to return a sequence of only transformers + * Modified version of Spark 2.x Pipeline + * + * @param data dataframe to fit on + * @param stagesToFit stages that need to be converted to transformers + * @param persistEveryKStages persist data in transforms every k stages for performance improvement + * @return fitted transformers + */ + protected def fitStages(data: DataFrame, stagesToFit: Array[OPStage], persistEveryKStages: Int) + (implicit spark: SparkSession): Array[OPStage] = { + + // TODO may want to make workflow take an optional reserve fraction + val splitters = stagesToFit.collect { case s: ModelSelectorBase[_, _] => s.splitter }.flatten + val splitter = splitters.reduceOption { (a, b) => + if (a.getReserveTestFraction > b.getReserveTestFraction) a else b + } + val (train, test) = splitter.map(_.split(data)).getOrElse((data, spark.emptyDataFrame)) + val hasTest = !test.isEmpty + + val dag = FitStagesUtil.computeDAG(resultFeatures) + .map(_.filter(s => stagesToFit.contains(s._1))) + .filter(_.nonEmpty) + + // Search for the last estimator + val indexOfLastEstimator: Option[Int] = + dag.collect { case seq if 
seq.exists(_._1.isInstanceOf[Estimator[_]]) => seq.head._2 }.lastOption + + // doing regular workflow fit without workflow level CV + if (!isWorkflowCV) { + FitStagesUtil.fitAndTransformDAG( + dag = dag, + train = train, + test = test, + hasTest = hasTest, + indexOfLastEstimator = indexOfLastEstimator, + persistEveryKStages = persistEveryKStages + ).transformers + } else { + // doing workflow level CV/TS + // Extract Model Selector and Split the DAG into + val CutDAG(modelSelectorOpt, before, during, after) = FitStagesUtil.cutDAG(dag) + + log.info("Applying initial DAG before CV/TS. Stages: {}", before.flatMap(_.map(_._1.stageName)).mkString(", ")) + val FittedDAG(beforeTrain, beforeTest, beforeTransformers) = FitStagesUtil.fitAndTransformDAG( + dag = before, + train = train, + test = test, + hasTest = hasTest, + indexOfLastEstimator = indexOfLastEstimator, + persistEveryKStages = persistEveryKStages + ) + + // Break up catalyst (cause it chokes) by converting into rdd, persisting it and then back to dataframe + val (trainRDD, testRDD) = (beforeTrain.rdd.persist(), beforeTest.rdd.persist()) + val (trainFixed, testFixed) = ( + spark.createDataFrame(trainRDD, beforeTrain.schema), + spark.createDataFrame(testRDD, beforeTest.schema) + ) + + modelSelectorOpt match { + case None => beforeTransformers + case Some((modelSelector, distance)) => + // estimate best model + log.info("Estimate best Model with CV/TS. Stages included in CV are: {}, {}", + during.flatMap(_.map(_._1.stageName)).mkString(", "), modelSelector.uid: Any + ) + modelSelector.findBestEstimator(trainFixed, during, persistEveryKStages) + val remainingDAG: StagesDAG = (during :+ (Array(modelSelector -> distance): Layer)) ++ after + + log.info("Applying DAG after CV/TS. Stages: {}", remainingDAG.flatMap(_.map(_._1.stageName)).mkString(", ")) + val fitted = FitStagesUtil.fitAndTransformDAG( + dag = remainingDAG, + train = trainFixed, + test = testFixed, + hasTest = hasTest, + indexOfLastEstimator = indexOfLastEstimator, + persistEveryKStages = persistEveryKStages, + fittedTransformers = beforeTransformers + ).transformers + trainRDD.unpersist() + testRDD.unpersist() + fitted + } + } + } + /** * Replaces any estimators in this workflow with their corresponding fit models from the OpWorkflowModel * passed in. Note that the Stages UIDs must EXACTLY correspond in order to be replaced so the same features @@ -352,15 +450,25 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { def loadModel(path: String): OpWorkflowModel = new OpWorkflowModelReader(this).load(path) /** - * Returns a dataframe containing all the columns generated up to the feature input + * Returns a dataframe containing all the columns generated up to and including the feature input * * @param feature input feature to compute up to * @param persistEveryKStages persist data in transforms every k stages for performance improvement - * @return Dataframe containing columns corresponding to all of the features generated before the feature given + * @return Dataframe containing columns corresponding to all of the features generated up to the feature given */ def computeDataUpTo(feature: OPFeature, persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages) (implicit spark: SparkSession): DataFrame = { - computeDataUpTo(stopStage = findOriginStageId(feature), fitted = false, persistEveryKStages) + if (findOriginStageId(feature).isEmpty) { + log.warn("Could not find origin stage for feature in workflow!! 
Defaulting to generate raw features.") + generateRawData() + } else { + val rawData = generateRawData() + val stagesToFit = FitStagesUtil.computeDAG(Array(feature)).flatMap(_.map(_._1)) + val fittedStages = fitStages(rawData, stagesToFit, persistEveryKStages) + val updatedFeature = feature.copyWithNewStages(fittedStages) + val dag = FitStagesUtil.computeDAG(Array(updatedFeature)) + applyTransformationsDAG(rawData, dag, persistEveryKStages) + } } /** @@ -383,16 +491,35 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore { * @tparam T Type of the data read in */ @Experimental - def withRawFeatureFilter[T](trainingReader: Option[Reader[T]], scoringReader: Option[Reader[T]], - bins: Int = 100, minFillRate: Double = 0.001, maxFillDifference: Double = 0.90, - maxFillRatioDiff: Double = 20.0, maxJSDivergence: Double = 0.90, protectedFeatures: Array[OPFeature] = Array.empty + def withRawFeatureFilter[T]( + trainingReader: Option[Reader[T]], + scoringReader: Option[Reader[T]], + bins: Int = 100, + minFillRate: Double = 0.001, + maxFillDifference: Double = 0.90, + maxFillRatioDiff: Double = 20.0, + maxJSDivergence: Double = 0.90, + maxCorrelation: Double = 0.95, + correlationType: CorrelationType = CorrelationType.Pearson, + protectedFeatures: Array[OPFeature] = Array.empty ): this.type = { val training = trainingReader.orElse(reader).map(_.asInstanceOf[Reader[T]]) require(training.nonEmpty, "Reader for training data must be provided either in withRawFeatureFilter or directly" + "as the reader for the workflow") val protectedRawFeatures = protectedFeatures.flatMap(_.rawFeatures).map(_.name).toSet - rawFeatureFilter = Option( new RawFeatureFilter(training.get, scoringReader, bins, minFillRate, - maxFillDifference, maxFillRatioDiff, maxJSDivergence, protectedRawFeatures) ) + rawFeatureFilter = Option { + new RawFeatureFilter( + trainingReader = training.get, + scoreReader = scoringReader, + bins = bins, + minFill = minFillRate, + maxFillDifference = maxFillDifference, + maxFillRatioDiff = maxFillRatioDiff, + maxJSDivergence = maxJSDivergence, + maxCorrelation = maxCorrelation, + correlationType = correlationType, + protectedFeatures = protectedRawFeatures) + } this } diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala b/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala index bec870c95b..ee0e268244 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala @@ -31,84 +31,33 @@ package com.salesforce.op -import com.salesforce.op.DAG.{Layer, StagesDAG} +import com.salesforce.op.utils.stages.FitStagesUtil._ +import com.salesforce.op.utils.stages.FitStagesUtil import com.salesforce.op.features.OPFeature import com.salesforce.op.features.types.FeatureType import com.salesforce.op.readers.{CustomReader, Reader, ReaderKey} -import com.salesforce.op.stages.impl.selector.ModelSelectorBase import com.salesforce.op.stages.{FeatureGeneratorStage, OPStage, OpTransformer} import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.stages.FitStagesUtil +import org.apache.spark.annotation.Experimental import org.apache.spark.ml._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.slf4j.LoggerFactory -import scala.collection.mutable.ListBuffer import scala.reflect.runtime.universe.WeakTypeTag -private[op] case object DAG { - - private[op] type Layer = Array[(OPStage, Int)] - private[op] type StagesDAG = 
Array[Layer] - - /** - * Computes stages DAG - * - * @param features array if features in workflow - * @return unique stages layered by distance (desc order) - */ - def compute(features: Array[OPFeature]): StagesDAG = { - - val (failures, parents) = features.map(_.parentStages()).partition(_.isFailure) - - if (failures.nonEmpty) { - throw new IllegalArgumentException("Failed to compute stages DAG", failures.head.failed.get) - } - - // Stages sorted by distance - val sortedByDistance: Array[(OPStage, Int)] = parents.flatMap(_.get) - - // Stages layered by distance - val layeredByDistance: StagesDAG = createLayers(sortedByDistance) - - - // Unique stages layered by distance - layeredByDistance - .foldLeft(Set.empty[OPStage], Array.empty[Array[(OPStage, Int)]]) { - case ((seen, filtered), uncleaned) => - // filter out any seen stages. also add distinct to filter out any duplicate stages in layer - val unseen = uncleaned.filterNot(v => seen.contains(v._1)).distinct - val nowSeen = seen ++ unseen.map(_._1) - (nowSeen, filtered :+ unseen) - }._2 - } - - /** - * Layers Stages by distance - * - * @param stages stages sorted by distance - * @return stages layered by distance - */ - def createLayers(stages: Array[(OPStage, Int)]): StagesDAG = { - stages.groupBy(_._2).toArray - .map(_._2.sortBy(_._1.getOutputFeatureName)) - .sortBy(s => -s.head._2) - } -} - /** * Parameters for pipelines and pipeline models */ private[op] trait OpWorkflowCore { - @transient implicit protected lazy val log = LoggerFactory.getLogger(this.getClass) + @transient protected lazy val log = LoggerFactory.getLogger(this.getClass) // the uid of the stage def uid: String - // Model Selector - private[op] type MS = ModelSelectorBase[_ <: Model[_], _ <: Estimator[_]] + // whether the CV/TV is performed on the workflow level + private[op] var isWorkflowCV = false // the data reader for the workflow or model private[op] var reader: Option[Reader[_]] = None @@ -138,6 +87,19 @@ private[op] trait OpWorkflowCore { this } + /** + * :: Experimental :: + * Decides whether the cross-validation/train-validation-split will be done at workflow level + * This will remove issues with data leakage, however it will impact the runtime + * + * @return this workflow that will train part of the DAG in the cross-validation/train validation split + */ + @Experimental + final def withWorkflowCV: this.type = { + isWorkflowCV = true + this + } + /** * Set data reader that will be used to generate data frame for stages @@ -256,83 +218,6 @@ private[op] trait OpWorkflowCore { */ protected def generateRawData()(implicit spark: SparkSession): DataFrame - /** - * Fit the estimators to return a sequence of only transformers - * Modified version of Spark 2.x Pipeline - * - * @param data dataframe to fit on - * @param stagesToFit stages that need to be converted to transformers - * @param persistEveryKStages persist data in transforms every k stages for performance improvement - * @return fitted transformers - */ - protected def fitStages(data: DataFrame, stagesToFit: Array[OPStage], persistEveryKStages: Int) - (implicit spark: SparkSession): Array[OPStage] = { - - // TODO may want to make workflow take an optional reserve fraction - val splitters = stagesToFit.collect{ case s: ModelSelectorBase[_, _] => s.splitter }.flatten - val splitter = splitters.reduceOption{ (a, b) => if (a.getReserveTestFraction > b.getReserveTestFraction) a else b } - val (train, test) = splitter.map(_.split(data)).getOrElse{ (data, spark.emptyDataFrame) } - val hasTest = !test.isEmpty - 
- val dag = DAG.compute(resultFeatures) - .map(_.filter(s => stagesToFit.contains(s._1))) - .filter(_.nonEmpty) - - // Search for the last estimator - val indexOfLastEstimator = dag - .collect { case seq if seq.exists( _._1.isInstanceOf[Estimator[_]] ) => seq.head._2 } - .lastOption - - val transformers = ListBuffer.empty[OPStage] - - dag.foldLeft((train.toDF(), test.toDF())) { - case ((currTrain, currTest), stagesLayer) => - val index = stagesLayer.head._2 - - val (newTrain, newTest, fitTransform) = FitStagesUtil.fitAndTransform( - train = currTrain, - test = currTest, - stages = stagesLayer.map(_._1), - transformData = indexOfLastEstimator.exists(_ < index), // only need to update for fit before last estimator - persistEveryKStages = persistEveryKStages, - doTest = Some(hasTest) - ) - - transformers.append(fitTransform: _*) - newTrain -> newTest - } - transformers.toArray - } - - - /** - * Returns a Dataframe containing all the columns generated up to the stop stage - * @param stopStage last stage to apply - * @param persistEveryKStages persist data in transforms every k stages for performance improvement - * @return Dataframe containing columns corresponding to all of the features generated before the feature given - */ - protected def computeDataUpTo(stopStage: Option[Int], fitted: Boolean, persistEveryKStages: Int) - (implicit spark: SparkSession): DataFrame = { - if (stopStage.isEmpty) { - log.warn("Could not find origin stage for feature in workflow!! Defaulting to generate raw features.") - generateRawData() - } else { - val featureStages = stages.slice(0, stopStage.get) - log.info("Found parent stage and computing features up to that stage:\n{}", - featureStages.map(s => s.uid + " --> " + s.getOutputFeatureName).mkString("\n") - ) - val rawData = generateRawData() - - if (!fitted) { - val stages = fitStages(rawData, featureStages, persistEveryKStages) - .map(_.asInstanceOf[Transformer]) - FitStagesUtil.applySparkTransformations(rawData, stages, persistEveryKStages) // TODO use DAG transform - } else { - featureStages.foldLeft(rawData)((data, stage) => stage.asInstanceOf[Transformer].transform(data)) - } - } - } - /** * Returns a dataframe containing all the columns generated up to the feature input * @@ -353,64 +238,6 @@ private[op] trait OpWorkflowCore { df.saveAvro(path) } - /** - * Method that cut DAG in order to perform proper CV/TS - * - * @param dag DAG in the workflow to be cut - * @return (Model Selector, nonCVTS DAG -to be done outside of CV/TS, CVTS DAG -to apply in the CV/TS) - */ - protected[op] def cutDAG(dag: StagesDAG): (Option[MS], StagesDAG, StagesDAG) = { - if (dag.isEmpty) (None, Array.empty, Array.empty) else { - // creates Array containing every Model Selector in the DAG - val modelSelectorArrays = dag.flatten.collect { case (ms: MS, dist: Int) => (ms, dist) } - val modelSelector = modelSelectorArrays.toList match { - case Nil => None - case List(ms) => Option(ms) - case modelSelectors => throw new IllegalArgumentException( - s"OpWorkflow can contain at most 1 Model Selector. Found ${modelSelectors.length} Model Selectors :" + - s" ${modelSelectors.map(_._1).mkString(",")}") - } - - // nonCVTS and CVTS DAGs - val (nonCVTSDAG: StagesDAG, cVTSDAG: StagesDAG) = modelSelector.map { case (ms, dist) => - // Optimize the DAG by removing stages unrelated to ModelSelector - val modelSelectorDAG = DAG.compute(Array(ms.getOutput())).dropRight(1) - - // Create the DAG without Model Selector. It will be used to compute the final nonCVTS DAG. 
- val nonMSDAG: StagesDAG = { - dag.filter(_.exists(_._2 >= dist)).toList match { - case stages :: Nil => Array(stages.filterNot(_._1.isInstanceOf[MS])) - case xs :+ x => xs.toArray :+ x.filterNot(_._1.isInstanceOf[MS]) - } - }.filter(!_.isEmpty) // Remove empty layers - - // Index of first CVTS stage in ModelSelector DAG - val firstCVTSIndex = modelSelectorDAG.toList.indexWhere(_.exists(stage => { - val inputs = stage._1.getTransientFeatures() - inputs.exists(_.isResponse) && inputs.exists(!_.isResponse) - })) - - // If no CVTS stages, the whole DAG is not in the CV/TS - if (firstCVTSIndex == -1) (nonMSDAG, Array.empty[Layer]) else { - - val cVTSDAG = modelSelectorDAG.drop(firstCVTSIndex) - - // nonCVTSDAG is the complementary DAG - // The rule is "nonCVTSDAG = nonMSDAG - CVTSDAG" - val nonCVTSDAG = { - val flattenedCVTSDAG = cVTSDAG.flatten.map(_._1) - nonMSDAG.map(_.filterNot { case (stage: OPStage, _) => flattenedCVTSDAG.contains(stage) }) - .filter(!_.isEmpty) // Remove empty layers - } - - (nonCVTSDAG, cVTSDAG) - } - }.getOrElse((Array.empty[Layer], Array.empty[Layer])) - (modelSelector.map(_._1), nonCVTSDAG, cVTSDAG) - } - } - - /** * Efficiently applies all fitted stages grouping by level in the DAG where possible * @@ -425,6 +252,9 @@ private[op] trait OpWorkflowCore { )(implicit spark: SparkSession): DataFrame = { // A holder for the last persisted rdd var lastPersisted: Option[DataFrame] = None + if (dag.exists(_.exists(_._1.isInstanceOf[Estimator[_]]))) { + throw new IllegalArgumentException("Cannot apply transformations to DAG that contains estimators") + } // Apply stages layer by layer dag.foldLeft(rawData) { case (df, stagesLayer) => diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala b/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala index a40f2ae8f8..665dde3277 100644 --- a/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala +++ b/core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala @@ -38,15 +38,14 @@ import com.salesforce.op.readers.DataFrameFieldNames._ import com.salesforce.op.stages.{OPStage, OpPipelineStage, OpTransformer} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.RichMetadata._ -import org.apache.spark.ml._ -import org.apache.spark.rdd.RDD +import com.salesforce.op.utils.stages.FitStagesUtil +import org.apache.spark.ml.Estimator import org.apache.spark.sql.types.Metadata -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.{DataFrame, SparkSession} import org.json4s.JValue import org.json4s.JsonAST.{JField, JObject} import org.json4s.jackson.JsonMethods.{pretty, render} -import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag @@ -93,15 +92,22 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams } /** - * Returns a dataframe containing all the columns generated up to the feature input + * Returns a dataframe containing all the columns generated up to and including the feature input * * @param feature input feature to compute up to * @throws IllegalArgumentException if a feature is not part of this workflow - * @return Dataframe containing columns corresponding to all of the features generated before the feature given + * @return Dataframe containing columns corresponding to all of the features generated up to the feature given */ def computeDataUpTo(feature: OPFeature, persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages) (implicit spark: SparkSession): DataFrame = { - 
computeDataUpTo(stopStage = findOriginStageId(feature), fitted = true, persistEveryKStages = persistEveryKStages) + if (findOriginStageId(feature).isEmpty) { + log.warn("Could not find origin stage for feature in workflow!! Defaulting to generate raw features.") + generateRawData() + } else { + val fittedFeature = feature.copyWithNewStages(stages) + val dag = FitStagesUtil.computeDAG(Array(fittedFeature)) + applyTransformationsDAG(generateRawData(), dag, persistEveryKStages) + } } /** @@ -123,7 +129,6 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams * @throws IllegalArgumentException if a feature is not part of this workflow * @return Updated instance of feature */ - // TODO change this method to give you raw features for use in stacked workflows def getUpdatedFeatures(features: Array[OPFeature]): Array[OPFeature] = { val allFeatures = rawFeatures ++ blacklistedFeatures ++ stages.map(_.getOutput()) features.map{f => allFeatures.find(_.sameOrigin(f)) @@ -307,9 +312,8 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams require(persistEveryKStages >= 1, s"persistEveryKStages value of $persistEveryKStages is invalid must be >= 1") // TODO: replace 'stages' with 'stagesDag'. (is a breaking change for serialization, but would simplify scoreFn) - // Pre-compute transformations dag - val dag = DAG.compute(resultFeatures) + val dag = FitStagesUtil.computeDAG(resultFeatures) (path: Option[String]) => { // Generate the dataframe with raw features diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala index 1a63fdc36d..324145b3c0 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichDateFeature.scala @@ -95,7 +95,6 @@ trait RichDateFeature { trackNulls: Boolean = TransmogrifierDefaults.TrackNulls, others: Array[FeatureLike[Date]] = Array.empty ): FeatureLike[OPVector] = { - // vectorize DateList f.toDateList().vectorize(dateListPivot = dateListPivot, referenceDate = referenceDate, trackNulls = trackNulls, others = others.map(_.toDateList())) } diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala b/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala index 6fbc2c9da5..e621cc76eb 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichFeaturesCollection.scala @@ -64,18 +64,20 @@ trait RichFeaturesCollection { * Convert features into a single vector feature using the feature engineering steps most likely to provide * good results based on the types of the individual features passed in * + * @param label optional label feature to be passed into stages that require the label column * @return vector feature */ - def transmogrify(): FeatureLike[OPVector] = - Transmogrifier.transmogrify(features.toSeq)(TransmogrifierDefaults).combine() + def transmogrify(label: Option[FeatureLike[RealNN]] = None): FeatureLike[OPVector] = + Transmogrifier.transmogrify(features = features.toSeq, label = label)(TransmogrifierDefaults).combine() /** * Convert features into a single vector feature using the feature engineering steps most likely to provide - * good results based on the types of the individual features passed in. + * good results based on the types of the individual features passed in * + * @param label optional label feature to be passed into stages that require the label column * @return vector feature */ - def autoTransform(): FeatureLike[OPVector] = transmogrify() + def autoTransform(label: Option[FeatureLike[RealNN]] = None): FeatureLike[OPVector] = transmogrify(label = label) }
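Review note: `transmogrify()` and `autoTransform()` now take an optional label so that label-aware stages (such as the decision-tree auto-bucketizer introduced elsewhere in this diff) can participate in automatic feature engineering. A minimal sketch of the new call shape, assuming the usual `FeatureBuilder` setup; the `Passenger` type and feature names are invented for illustration and are not part of this change:

```scala
import com.salesforce.op._
import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

// Hypothetical input record and features, purely for illustration
case class Passenger(age: Option[Double], height: Option[Double], survived: Double)

val survived = FeatureBuilder.RealNN[Passenger].extract(_.survived.toRealNN).asResponse
val age = FeatureBuilder.Real[Passenger].extract(_.age.toReal).asPredictor
val height = FeatureBuilder.Real[Passenger].extract(_.height.toReal).asPredictor

// Before this change: Seq(age, height).transmogrify()
// Passing the label lets label-dependent vectorizers be chosen where applicable
val featureVector = Seq(age, height).transmogrify(label = Some(survived))
```

Since `label` defaults to `None`, existing call sites compile unchanged.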
diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala index 0661ef6077..07e3e2c96d 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala @@ -252,6 +252,7 @@ trait RichMapFeature { * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. * @param forceSharedHashSpace force the hash space to be shared among all included features + * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or * failed to make a good enough prediction. * @param hashAlgorithm hash algorithm to use @@ -276,6 +277,7 @@ trait RichMapFeature { prependFeatureName: Boolean = TransmogrifierDefaults.PrependFeatureName, autoDetectThreshold: Double = TextTokenizer.AutoDetectThreshold, forceSharedHashSpace: Boolean = false, + hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy, defaultLanguage: Language = TextTokenizer.DefaultLanguage, hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, others: Array[FeatureLike[TextMap]] = Array.empty @@ -298,6 +300,7 @@ trait RichMapFeature { .setHashWithIndex(hashWithIndex) .setPrependFeatureName(prependFeatureName) .setForceSharedHashSpace(forceSharedHashSpace) + .setHashSpaceStrategy(hashSpaceStrategy) .setHashAlgorithm(hashAlgorithm) .setBinaryFreq(binaryFreq) .getOutput() @@ -381,6 +384,7 @@ trait RichMapFeature { * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. * @param forceSharedHashSpace force the hash space to be shared among all included features + * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or * failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use @@ -405,6 +409,7 @@ trait RichMapFeature { prependFeatureName: Boolean = TransmogrifierDefaults.PrependFeatureName, autoDetectThreshold: Double = TextTokenizer.AutoDetectThreshold, forceSharedHashSpace: Boolean = false, + hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy, defaultLanguage: Language = TextTokenizer.DefaultLanguage, hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, others: Array[FeatureLike[TextAreaMap]] = Array.empty @@ -427,6 +432,7 @@ trait RichMapFeature { .setHashWithIndex(hashWithIndex) .setPrependFeatureName(prependFeatureName) .setForceSharedHashSpace(forceSharedHashSpace) + .setHashSpaceStrategy(hashSpaceStrategy) .setHashAlgorithm(hashAlgorithm) .setBinaryFreq(binaryFreq) .getOutput() @@ -518,7 +524,7 @@ trait RichMapFeature { } /** - * Apply RealMapVectorizer on any OPMap that has double values + * Apply RealMapVectorizer or auto bucketizer (when label is present) on any OPMap that has double values * * @param others other features of the same type * @param defaultValue value to give missing keys on pivot @@ -526,6 +532,10 @@ trait RichMapFeature { * @param whiteListKeys keys to whitelist * @param blackListKeys keys to blacklist * @param trackNulls option to keep track of values that were missing + * @param label optional label column to be passed into autoBucketizer if present + * @param trackInvalid option to keep track of invalid values, + * eg. NaN, -/+Inf or values that fall outside the buckets + * @param minInfoGain minimum info gain, one of the stopping criteria of the Decision Tree * * @return an OPVector feature */ @@ -536,17 +546,29 @@ trait RichMapFeature { whiteListKeys: Array[String] = Array.empty, blackListKeys: Array[String] = Array.empty, others: Array[FeatureLike[T]] = Array.empty, - trackNulls: Boolean = TransmogrifierDefaults.TrackNulls + trackNulls: Boolean = TransmogrifierDefaults.TrackNulls, + trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid, + minInfoGain: Double = TransmogrifierDefaults.MinInfoGain, + label: Option[FeatureLike[RealNN]] = None ): FeatureLike[OPVector] = { - new RealMapVectorizer[T]() - .setInput(f +: others) - .setFillWithMean(fillWithMean) - .setDefaultValue(defaultValue) - .setCleanKeys(cleanKeys) - .setWhiteListKeys(whiteListKeys) - .setBlackListKeys(blackListKeys) - .setTrackNulls(trackNulls) - .getOutput() + label match { + case None => + new RealMapVectorizer[T]() + .setInput(f +: others) + .setFillWithMean(fillWithMean) + .setDefaultValue(defaultValue) + .setCleanKeys(cleanKeys) + .setWhiteListKeys(whiteListKeys) + .setBlackListKeys(blackListKeys) + .setTrackNulls(trackNulls) + .getOutput() + case Some(lbl) => + autoBucketize( + label = lbl, trackNulls = trackNulls, trackInvalid = trackInvalid, + minInfoGain = minInfoGain, cleanKeys = cleanKeys, + whiteListKeys = whiteListKeys, blackListKeys = blackListKeys + ) + } } } @@ -590,7 +612,7 @@ trait RichMapFeature { } /** - * Apply IntegralMapVectorizer on any OPMap that has long values + * Apply IntegralMapVectorizer or auto bucketizer (when label is present) on any OPMap that has long values * * @param others other features of the same type * @param defaultValue value to give missing keys on pivot @@ -598,6 +620,10 @@ trait RichMapFeature { * @param whiteListKeys keys to whitelist * @param blackListKeys keys to blacklist * @param trackNulls option to keep track of values that were missing + * @param label optional label column to be passed into autoBucketizer if 
present + * @param trackInvalid option to keep track of invalid values, + * eg. NaN, -/+Inf or values that fall outside the buckets + * @param minInfoGain minimum info gain, one of the stopping criteria of the Decision Tree * * @return an OPVector feature */ def vectorize ( fillWithMode: Boolean = TransmogrifierDefaults.FillWithMode, defaultValue: Double = 0.0, cleanKeys: Boolean = TransmogrifierDefaults.CleanKeys, whiteListKeys: Array[String] = Array.empty, blackListKeys: Array[String] = Array.empty, others: Array[FeatureLike[T]] = Array.empty, - trackNulls: Boolean = TransmogrifierDefaults.TrackNulls + trackNulls: Boolean = TransmogrifierDefaults.TrackNulls, + trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid, + minInfoGain: Double = TransmogrifierDefaults.MinInfoGain, + label: Option[FeatureLike[RealNN]] = None ): FeatureLike[OPVector] = { - new IntegralMapVectorizer[T]() - .setInput(f +: others) - .setFillWithMode(fillWithMode) - .setDefaultValue(defaultValue) - .setCleanKeys(cleanKeys) - .setWhiteListKeys(whiteListKeys) - .setBlackListKeys(blackListKeys) - .setTrackNulls(trackNulls) - .getOutput() + label match { + case None => + new IntegralMapVectorizer[T]() + .setInput(f +: others) + .setFillWithMode(fillWithMode) + .setDefaultValue(defaultValue) + .setCleanKeys(cleanKeys) + .setWhiteListKeys(whiteListKeys) + .setBlackListKeys(blackListKeys) + .setTrackNulls(trackNulls) + .getOutput() + case Some(lbl) => + autoBucketize( + label = lbl, trackNulls = trackNulls, trackInvalid = trackInvalid, + minInfoGain = minInfoGain, cleanKeys = cleanKeys, + whiteListKeys = whiteListKeys, blackListKeys = blackListKeys + ) + } } } diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala index 95378c8ca2..539dfac834 100--- a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala @@ -316,22 +316,37 @@ trait RichNumericFeature { * @param fillValue value to pull in place of nulls * @param trackNulls keep track of when nulls occur by adding a second column to the vector with a null indicator * @param fillWithMean replace missing values with mean (as opposed to constant provided in fillValue) - * @return + * @param trackInvalid option to keep track of invalid values, + * eg. NaN, -/+Inf or values that fall outside the buckets + * @param minInfoGain minimum info gain, one of the stopping criteria of the Decision Tree for the autoBucketizer + * @param label optional label column to be passed into autoBucketizer if present + * @return a vector feature containing the raw Features with filled missing values and the bucketized + * features if a label argument is passed */ def vectorize ( fillValue: Double, fillWithMean: Boolean, trackNulls: Boolean, - others: Array[FeatureLike[T]] = Array.empty + others: Array[FeatureLike[T]] = Array.empty, + trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid, + minInfoGain: Double = TransmogrifierDefaults.MinInfoGain, + label: Option[FeatureLike[RealNN]] = None ): FeatureLike[OPVector] = { - val stage = new RealVectorizer[T]() - .setInput(f +: others) - .setTrackNulls(trackNulls) + val features = f +: others + val stage = new RealVectorizer[T]().setInput(features).setTrackNulls(trackNulls) if (fillWithMean) stage.setFillWithMean else stage.setFillWithConstant(fillValue) - stage.getOutput() + val filledValues = stage.getOutput() + label match { + case None => + filledValues + case Some(lbl) => + val bucketized = features.map( + _.autoBucketize(label = lbl, trackNulls = false, trackInvalid = trackInvalid, minInfoGain = minInfoGain) + ) + new VectorsCombiner().setInput(filledValues +: bucketized).getOutput() + } } - }
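Review note: the numeric `vectorize` above gains `trackInvalid`, `minInfoGain`, and an optional `label`; when a label is supplied, the filled vector is combined (via `VectorsCombiner`) with a per-feature `autoBucketize` output. A sketch of the new call shape; the `Record` type and the `amount`/`target` feature names are invented for illustration:

```scala
import com.salesforce.op._
import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

case class Record(amount: Option[Double], target: Double) // illustrative only

val target = FeatureBuilder.RealNN[Record].extract(_.target.toRealNN).asResponse
val amount = FeatureBuilder.Real[Record].extract(_.amount.toReal).asPredictor

// Without a label: fill + null tracking only, exactly as before
val filled = amount.vectorize(fillValue = 0.0, fillWithMean = true, trackNulls = true)

// With a label: also appends decision-tree buckets of the raw feature
val filledAndBucketed = amount.vectorize(
  fillValue = 0.0, fillWithMean = true, trackNulls = true,
  trackInvalid = true, minInfoGain = 0.01, label = Some(target)
)
```

Note the implementation calls the bucketizer with `trackNulls = false`, since null tracking is already handled by the filling vectorizer; this avoids duplicated null-indicator columns.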
@@ -418,6 +433,10 @@ trait RichNumericFeature { * @param minVariance Minimum amount of variance allowed for each feature and label * @param removeBadFeatures If set to true, this will automatically remove all the bad features * from the feature vector + * @param removeFeatureGroup remove all features descended from a parent feature + * @param protectTextSharedHash protect text shared hash from related null indicators and other hashes + * @param categoricalLabel If true, treat label as categorical. If not set, check number of distinct labels to + * decide whether a label should be treated as categorical. * @return sanity checked feature vector */ // scalastyle:off @@ -434,6 +453,7 @@ trait RichNumericFeature { minVariance: Double = SanityChecker.MinVariance, removeBadFeatures: Boolean = SanityChecker.RemoveBadFeatures, removeFeatureGroup: Boolean = SanityChecker.RemoveFeatureGroup, + protectTextSharedHash: Boolean = SanityChecker.ProtectTextSharedHash, categoricalLabel: Option[Boolean] = None ): FeatureLike[OPVector] = { // scalastyle:on @@ -449,6 +469,7 @@ trait RichNumericFeature { .setMinVariance(minVariance) .setRemoveBadFeatures(removeBadFeatures) .setRemoveFeatureGroup(removeFeatureGroup) + .setProtectTextSharedHash(protectTextSharedHash) .setInput(f, featureVector) categoricalLabel.foreach(checker.setCategoricalLabel) @@ -578,18 +599,36 @@ trait RichNumericFeature { * @param fillValue value to pull in place of nulls * @param trackNulls keep track of when nulls occur by adding a second column to the vector with a null indicator * @param fillWithMode replace missing values with mode (as opposed to constant provided in fillValue) - * @return + * @param trackInvalid option to keep track of invalid values, + * eg. 
NaN, -/+Inf or values that fall outside the buckets + * @param minInfoGain minimum info gain, one of the stopping criteria of the Decision Tree for the autoBucketizer + * @param label optional label column to be passed into autoBucketizer if present + * @return a vector feature containing the raw Features with filled missing values and the bucketized + * features if a label argument is passed */ def vectorize ( fillValue: Long, fillWithMode: Boolean, trackNulls: Boolean, - others: Array[FeatureLike[T]] = Array.empty + others: Array[FeatureLike[T]] = Array.empty, + trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid, + minInfoGain: Double = TransmogrifierDefaults.MinInfoGain, + label: Option[FeatureLike[RealNN]] = None ): FeatureLike[OPVector] = { - val stage = new IntegralVectorizer().setInput(f +: others).setTrackNulls(trackNulls) + val features = f +: others + val stage = new IntegralVectorizer[T]().setInput(features).setTrackNulls(trackNulls) if (fillWithMode) stage.setFillWithMode else stage.setFillWithConstant(fillValue) - stage.getOutput() + val filledValues = stage.getOutput() + label match { + case None => + filledValues + case Some(lbl) => + val bucketized = features.map( + _.autoBucketize(label = lbl, trackNulls = false, trackInvalid = trackInvalid, minInfoGain = minInfoGain) + ) + new VectorsCombiner().setInput(filledValues +: bucketized).getOutput() + } } } diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala index baf25c1e07..46f4a91bf3 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala @@ -191,6 +191,7 @@ trait RichTextFeature { * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. * @param forceSharedHashSpace force the hash space to be shared among all included features + * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or * failed to make a good enough prediction. * @param hashAlgorithm hash algorithm to use @@ -215,6 +216,7 @@ trait RichTextFeature { prependFeatureName: Boolean = TransmogrifierDefaults.PrependFeatureName, autoDetectThreshold: Double = TextTokenizer.AutoDetectThreshold, forceSharedHashSpace: Boolean = false, + hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy, defaultLanguage: Language = TextTokenizer.DefaultLanguage, hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, others: Array[FeatureLike[T]] = Array.empty @@ -237,6 +239,7 @@ trait RichTextFeature { .setHashWithIndex(hashWithIndex) .setPrependFeatureName(prependFeatureName) .setForceSharedHashSpace(forceSharedHashSpace) + .setHashSpaceStrategy(hashSpaceStrategy) .setHashAlgorithm(hashAlgorithm) .setBinaryFreq(binaryFreq) .getOutput() @@ -249,8 +252,8 @@ trait RichTextFeature { * The indices are in [0, numLabels), ordered by label frequencies. * So the most frequent label gets index 0. 
* - * @param unseenName name to give strings that appear in transform but not in fit - * @param handleInvalid how to transform values not seen in fitting + * @param unseenName name to give strings that appear in transform but not in fit + * @param handleInvalid how to transform values not seen in fitting * @see [[OpIndexToString]] for the inverse transformation * * @return indexed real feature @@ -263,7 +266,7 @@ trait RichTextFeature { case StringIndexerHandleInvalid.NoFilter => f.transformWith( new OpStringIndexerNoFilter[T]().setUnseenName(unseenName) ) - case _ => f.transformWith( new OpStringIndexer[T]().setHandleInvalid(handleInvalid) ) + case _ => f.transformWith(new OpStringIndexer[T]().setHandleInvalid(handleInvalid)) } } @@ -273,10 +276,10 @@ trait RichTextFeature { * @param languageDetector a language detector instance * @param analyzer a text analyzer instance * @param autoDetectLanguage indicates whether to attempt language detection - * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or - * failed to make a good enough prediction. * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. + * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or + * failed to make a good enough prediction. * @param minTokenLength minimum token length, >= 1. * @param toLowercase indicates whether to convert all characters to lowercase before analyzing * @return tokenized feature @@ -303,10 +306,10 @@ trait RichTextFeature { * Tokenize text using [[LuceneTextAnalyzer]] with [[OptimaizeLanguageDetector]] * * @param autoDetectLanguage indicates whether to attempt language detection - * @param defaultLanguage a language to assume in case no language was detected or - * when autoDetectLanguage is set to false * @param autoDetectThreshold Language detection threshold. If none of the detected languages have * confidence greater than the threshold then defaultLanguage is used. + * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or + * failed to make a good enough prediction. * @param minTokenLength minimum token length, >= 1. * @param toLowercase indicates whether to convert all characters to lowercase before analyzing * @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing @@ -379,6 +382,37 @@ trait RichTextFeature { def detectLanguages(languageDetector: LanguageDetector = LangDetector.DefaultDetector): FeatureLike[RealMap] = f.transformWith(new LangDetector[T](languageDetector)) + /** + * Find named entities in the text using OpenNLP [[OpenNLPAnalyzer]] + * + * @param languageDetector a language detector instance + * @param analyzer a text analyzer instance + * @param sentenceSplitter sentence splitter + * @param tagger named entity recognition tagger + * @param autoDetectLanguage indicates whether to attempt language detection + * @param autoDetectThreshold Language detection threshold. If none of the detected languages have + * confidence greater than the threshold then defaultLanguage is used. + * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or + * failed to make a good enough prediction. + * @return named entity sets feature + */ + def recognizeEntities + ( + languageDetector: LanguageDetector = NameEntityRecognizer.LanguageDetector, + analyzer: TextAnalyzer = NameEntityRecognizer.Analyzer, + sentenceSplitter: SentenceSplitter = NameEntityRecognizer.Splitter, + tagger: NameEntityTagger[_ <: TaggerResult] = NameEntityRecognizer.Tagger, + autoDetectLanguage: Boolean = NameEntityRecognizer.AutoDetectLanguage, + autoDetectThreshold: Double = NameEntityRecognizer.AutoDetectThreshold, + defaultLanguage: Language = NameEntityRecognizer.DefaultLanguage + ): FeatureLike[MultiPickListMap] = { + f.transformWith( + new NameEntityRecognizer[T](languageDetector, analyzer, sentenceSplitter, tagger) + .setAutoDetectLanguage(autoDetectLanguage) + .setAutoDetectThreshold(autoDetectThreshold) + .setDefaultLanguage(defaultLanguage) + ) + } }
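Review note: `recognizeEntities` is the user-facing entry point for the new OpenNLP-backed named entity recognition (hence the `lucene-analyzers-opennlp` dependency added in `core/build.gradle` above). A short usage sketch; the `Doc` record and `body` feature are invented for illustration:

```scala
import com.salesforce.op._
import com.salesforce.op.features.{FeatureBuilder, FeatureLike}
import com.salesforce.op.features.types._

case class Doc(body: Option[String]) // illustrative record type

val body = FeatureBuilder.Text[Doc].extract(_.body.toText).asPredictor

// All arguments default to the NameEntityRecognizer companion values listed above;
// the result maps tokens to the set of entity tags detected for them.
val entities: FeatureLike[MultiPickListMap] = body.recognizeEntities()
```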
implicit class RichPhoneFeature(val f: FeatureLike[Phone]) { @@ -480,10 +514,10 @@ trait RichTextFeature { * 0 if invalid and with an optional second element indicating if the phone number was null * * @param defaultRegion region against which to check phone validity - * @param isStrict strict validation means cannot have extra digits - * @param trackNulls produce column indicating if the number was null - * @param fillValue value to fill in for nulls in vactor creation - * @param others other phone numbers to vectorize + * @param isStrict strict validation means cannot have extra digits + * @param trackNulls produce column indicating if the number was null + * @param fillValue value to fill in for nulls in vector creation + * @param others other phone numbers to vectorize * @return vector feature containing information about phone number */ def vectorize( @@ -503,12 +537,14 @@ trait RichTextFeature { /** * Extract email prefixes + * * @return email prefix */ def toEmailPrefix: FeatureLike[Text] = f.map[Text](_.prefix.toText, "prefix") /** * Extract email domains + * * @return email domain */ def toEmailDomain: FeatureLike[Text] = f.map[Text](_.domain.toText, "domain") @@ -518,10 +554,10 @@ trait RichTextFeature { * and keeping the top K occurrences of each feature, along with an extra column per feature * indicating how many values were not in the top K. * - * @param others Other [[Email]] features - * @param topK How many values to keep in the vector + * @param others Other [[Email]] features + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ @@ -563,6 +599,7 @@ trait RichTextFeature { /** * Verifies if the url is of correct form of "Uniform Resource Identifiers (URI): Generic Syntax" * RFC2396 (http://www.ietf.org/rfc/rfc2396.txt) + * * @param protocols url protocols to consider valid, i.e. http, https, ftp etc. */ def isValidUrl(protocols: Array[String]): FeatureLike[Binary] = f.exists(_.isValid(protocols)) @@ -572,10 +609,10 @@ trait RichTextFeature { * and keeping the top K occurrences of each feature, along with an extra column per feature * indicating how many values were not in the top K. 
* - * @param others Other [[URL]] features - * @param topK How many values to keep in the vector + * @param others Other [[URL]] features + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ @@ -613,12 +650,12 @@ trait RichTextFeature { * Extracts Base64 features (MIME type etc.), * then converts those into PickList features and vectorizes them. * - * @param topK number of values to keep for each key - * @param minSupport min times a value must occur to be retained in pivot - * @param cleanText clean text before pivoting + * @param topK number of values to keep for each key + * @param minSupport min times a value must occur to be retained in pivot + * @param cleanText clean text before pivoting * @param trackNulls keep an extra column that indicated if feature was null - * @param typeHint MIME type hint, i.e. 'application/json', 'text/plain' etc. - * @param others other features of the same type + * @param typeHint MIME type hint, i.e. 'application/json', 'text/plain' etc. + * @param others other features of the same type * @return result feature of type vector */ def vectorize( @@ -646,10 +683,10 @@ trait RichTextFeature { * Converts a sequence of [[PickList]] features into a vector keeping the top K occurrences of each feature, * along with an extra column per feature indicating how many values were not in the top K. * - * @param others Other [[PickList]] features to include in pivot - * @param topK How many values to keep in the vector + * @param others Other [[PickList]] features to include in pivot + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ @@ -672,10 +709,10 @@ trait RichTextFeature { * Converts a sequence of [[ComboBox]] features into a vector keeping the top K occurrences of each feature, * along with an extra column per feature indicating how many values were not in the top K. * - * @param others Other [[ComboBox]] features to include in pivot - * @param topK How many values to keep in the vector + * @param others Other [[ComboBox]] features to include in pivot + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ @@ -698,10 +735,10 @@ trait RichTextFeature { * Converts a sequence of [[ID]] features into a vector keeping the top K occurrences of each feature, * along with an extra column per feature indicating how many values were not in the top K. 
* - * @param others Other [[ID]] features to include in pivot - * @param topK How many values to keep in the vector + * @param others Other [[ID]] features to include in pivot + * @param topK How many values to keep in the vector * @param minSupport Min times a value must occur to be retained in pivot - * @param cleanText If true, ignores capitalization and punctuations when grouping categories + * @param cleanText If true, ignores capitalization and punctuations when grouping categories * @param trackNulls keep an extra column that indicated if feature was null * @return The vectorized features */ diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala index f10bc18bce..84bb9cbe81 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichVectorFeature.scala @@ -34,7 +34,7 @@ package com.salesforce.op.dsl import com.salesforce.op.UID import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.{Impurity, OpRandomForest} +import com.salesforce.op.stages.impl.classification.{Impurity, OpRandomForestClassifier} import com.salesforce.op.stages.impl.feature.{DropIndicesByTransformer, OpLDA} import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} @@ -86,8 +86,8 @@ trait RichVectorFeature { impurity: Impurity = Impurity.Entropy, seed: Long = util.Random.nextLong, thresholds: Array[Double] = Array.empty - ): (FeatureLike[RealNN], FeatureLike[OPVector], FeatureLike[OPVector]) = { - val OpRF = new OpRandomForest().setInput(label, f) + ): (FeatureLike[Prediction]) = { + val OpRF = new OpRandomForestClassifier().setInput(label, f) if (thresholds.nonEmpty) OpRF.setThresholds(thresholds) OpRF.setMaxDepth(maxDepth) @@ -96,7 +96,7 @@ trait RichVectorFeature { .setMinInfoGain(minInfoGain) .setSubsamplingRate(subSamplingRate) .setNumTrees(numTrees) - .setImpurity(impurity) + .setImpurity(impurity.sparkName) .setSeed(seed) .getOutput() } diff --git a/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala b/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala index fb8c0ba3e2..378cef5634 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala +++ b/core/src/main/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluator.scala @@ -33,8 +33,12 @@ package com.salesforce.op.evaluators import com.salesforce.op.UID import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, MulticlassClassificationEvaluator} +import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.sql.Dataset +import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics => SparkMLBinaryClassificationMetrics} +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.types.DoubleType import org.slf4j.LoggerFactory /** @@ -52,7 +56,8 @@ private[op] class OpBinaryClassificationEvaluator ( override val name: String = OpEvaluatorNames.binary, override val isLargerBetter: Boolean = true, - override val uid: String = UID[OpBinaryClassificationEvaluator] + override val uid: String = UID[OpBinaryClassificationEvaluator], + val numBins: Int = 100 ) extends 
OpBinaryClassificationEvaluatorBase[BinaryClassificationMetrics](uid = uid) { @transient private lazy val log = LoggerFactory.getLogger(this.getClass) @@ -60,22 +65,21 @@ private[op] class OpBinaryClassificationEvaluator def getDefaultMetric: BinaryClassificationMetrics => Double = _.AuROC override def evaluateAll(data: Dataset[_]): BinaryClassificationMetrics = { - val (labelColName, rawPredictionColName, predictionColName) = (getLabelCol, getRawPredictionCol, getPredictionCol) + val (labelColName, rawPredictionColName, predictionColName, probabilityColName) = + (getLabelCol, getRawPredictionCol, getPredictionCol, getProbabilityCol) log.debug( - "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n", - labelColName, rawPredictionColName, predictionColName + "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n probability : {}\n", + labelColName, rawPredictionColName, predictionColName, probabilityColName ) - val Array(aUROC, aUPR) = - Array(BinaryClassEvalMetrics.AuROC, BinaryClassEvalMetrics.AuPR).map(getBinaryEvaluatorMetric(_, data)) - import data.sparkSession.implicits._ val rdd = data.select(predictionColName, labelColName).as[(Double, Double)].rdd if (rdd.isEmpty()) { log.error("The dataset is empty") - BinaryClassificationMetrics(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + BinaryClassificationMetrics(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + Seq(), Seq(), Seq(), Seq()) } else { val multiclassMetrics = new MulticlassMetrics(rdd) val labels = multiclassMetrics.labels @@ -94,11 +98,23 @@ private[op] class OpBinaryClassificationEvaluator val f1 = if (precision + recall == 0.0) 0.0 else 2 * precision * recall / (precision + recall) val error = if (tp + fp + tn + fn == 0.0) 0.0 else (fp + fn) / (tp + fp + tn + fn) + val scoreAndLabels = + data.select(col(probabilityColName), col(labelColName).cast(DoubleType)).rdd.map { + case Row(prob: Vector, label: Double) => (prob(1), label) + case Row(prob: Double, label: Double) => (prob, label) + } + val sparkMLMetrics = new SparkMLBinaryClassificationMetrics(scoreAndLabels = scoreAndLabels, numBins = numBins) + val thresholds = sparkMLMetrics.thresholds().collect() + val precisionByThreshold = sparkMLMetrics.precisionByThreshold().collect().map(_._2) + val recallByThreshold = sparkMLMetrics.recallByThreshold().collect().map(_._2) + val falsePositiveRateByThreshold = sparkMLMetrics.roc().collect().map(_._1).slice(1, thresholds.length + 1) + val aUROC = sparkMLMetrics.areaUnderROC() + val aUPR = sparkMLMetrics.areaUnderPR() val metrics = BinaryClassificationMetrics( Precision = precision, Recall = recall, F1 = f1, AuROC = aUROC, - AuPR = aUPR, Error = error, TP = tp, TN = tn, FP = fp, FN = fn + AuPR = aUPR, Error = error, TP = tp, TN = tn, FP = fp, FN = fn, + thresholds, precisionByThreshold, recallByThreshold, falsePositiveRateByThreshold ) - log.info("Evaluated metrics: {}", metrics.toString) metrics } @@ -147,5 +163,9 @@ case class BinaryClassificationMetrics TP: Double, TN: Double, FP: Double, - FN: Double + FN: Double, + thresholds: Seq[Double], + precisionByThreshold: Seq[Double], + recallByThreshold: Seq[Double], + falsePositiveRateByThreshold: Seq[Double] ) extends EvaluationMetrics
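Review note: `BinaryClassificationMetrics` now carries threshold-indexed curves computed from `(probability, label)` pairs via Spark mllib's `BinaryClassificationMetrics`, down-sampled to `numBins` (default 100). A sketch of consuming the new fields, assuming an `OpBinaryClassificationEvaluator` instance and a scored `DataFrame` are already in scope (the class is `private[op]`, so this runs inside the `com.salesforce.op` package):

```scala
val metrics: BinaryClassificationMetrics = evaluator.evaluateAll(scoredData)

// The four sequences are aligned by index, which is enough to plot ROC/PR
// curves without rescoring the data.
metrics.thresholds.zip(metrics.precisionByThreshold).foreach { case (t, p) =>
  println(f"threshold=$t%.2f precision=$p%.3f")
}
```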
b/core/src/main/scala/com/salesforce/op/evaluators/OpEvaluatorBase.scala @@ -118,7 +118,6 @@ trait EvaluationMetrics extends JsonLike { * @return metadata */ def toMetadata: Metadata = this.toMap.toMetadata - } @@ -205,6 +204,10 @@ sealed abstract class ClassificationEvalMetric(val sparkEntryName: String) exten */ object BinaryClassEvalMetrics extends Enum[ClassificationEvalMetric] { val values = findValues + case object Precision extends ClassificationEvalMetric("precision") + case object Recall extends ClassificationEvalMetric("recall") + case object F1 extends ClassificationEvalMetric("f1") + case object Error extends ClassificationEvalMetric("accuracy") case object AuROC extends ClassificationEvalMetric("areaUnderROC") case object AuPR extends ClassificationEvalMetric("areaUnderPR") } @@ -218,8 +221,10 @@ object MultiClassEvalMetrics extends Enum[ClassificationEvalMetric] { case object Recall extends ClassificationEvalMetric("weightedRecall") case object F1 extends ClassificationEvalMetric("f1") case object Error extends ClassificationEvalMetric("accuracy") + case object ThresholdMetrics extends ClassificationEvalMetric("thresholdMetrics") } + /** * Contains the names of metrics used in logging */ diff --git a/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala b/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala index 6f7a60c9dc..07cfb0a92b 100644 --- a/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala +++ b/core/src/main/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluator.scala @@ -32,21 +32,24 @@ package com.salesforce.op.evaluators import com.salesforce.op.UID -import com.salesforce.op.features.types._ +import com.twitter.algebird.Monoid._ +import com.twitter.algebird.Operators._ +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.param.{DoubleArrayParam, IntArrayParam} import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset import org.slf4j.LoggerFactory -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator /** - * * Instance to evaluate Multi Classification metrics * The metrics are Precision, Recall, F1 and Error Rate * Default evaluation returns F1 score * - * @param name name of default metric + * @param name name of default metric * @param isLargerBetter is metric better if larger - * @param uid uid for instance + * @param uid uid for instance */ private[op] class OpMultiClassificationEvaluator ( @@ -59,12 +62,33 @@ private[op] class OpMultiClassificationEvaluator def getDefaultMetric: MultiClassificationMetrics => Double = _.F1 + final val topNs = new IntArrayParam( + parent = this, + name = "topNs", + doc = "sequence of topN values to use for threshold metrics", + isValid = _.forall(_ > 0) + ) + setDefault(topNs, Array(1, 3)) + + def setTopNs(v: Array[Int]): this.type = set(topNs, v) + + final val thresholds = new DoubleArrayParam( + parent = this, + name = "thresholds", + doc = "sequence of threshold values (must be in [0.0, 1.0]) to use for threshold metrics", + isValid = _.forall(x => x >= 0.0 && x <= 1.0) + ) + setDefault(thresholds, (0 to 100).map(_ / 100.0).toArray) + + def setThresholds(v: Array[Double]): this.type = set(thresholds, v) + override def evaluateAll(data: Dataset[_]): MultiClassificationMetrics = { - val (labelColName, predictionColName, rawPredictionColName) 
= (getLabelCol, getPredictionCol, getRawPredictionCol)
+    val (labelColName, predictionColName, rawPredictionColName, probabilityColName) = (getLabelCol, getPredictionCol,
+      getRawPredictionCol, getProbabilityCol)
     log.debug(
-      "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n",
-      labelColName, rawPredictionColName, predictionColName
+      "Evaluating metrics on columns :\n label : {}\n rawPrediction : {}\n prediction : {}\n probability : {}\n",
+      labelColName, rawPredictionColName, predictionColName, probabilityColName
     )
     import data.sparkSession.implicits._
@@ -76,13 +100,132 @@ private[op] class OpMultiClassificationEvaluator
     val recall = multiclassMetrics.weightedRecall
     val f1 = if (precision + recall == 0.0) 0.0 else 2 * precision * recall / (precision + recall)
-    val metrics = MultiClassificationMetrics(Precision = precision, Recall = recall, F1 = f1, Error = error)
+    val thresholdMetrics = calculateThresholdMetrics(
+      data = data.select(probabilityColName, labelColName).rdd.map(r => (r.getAs[Vector](0).toArray, r.getDouble(1))),
+      topNs = $(topNs),
+      thresholds = $(thresholds)
+    )
+
+    val metrics = MultiClassificationMetrics(
+      Precision = precision,
+      Recall = recall,
+      F1 = f1,
+      Error = error,
+      ThresholdMetrics = thresholdMetrics
+    )
     log.info("Evaluated metrics: {}", metrics.toString)
     metrics
   }
+  /**
+   * Function that calculates a set of threshold metrics for different topN values given an RDD of scores & labels,
+   * a list of topN values to consider, and a list of thresholds to use.
+   *
+   * Output: ThresholdMetrics object, containing thresholds used, topN values used, and maps from topN value to
+   * arrays of correct, incorrect, and no prediction counts at each threshold. Summing all three of these arrays
+   * together should give an array where each entry is the total number of rows in the input RDD.
+   *
+   * @param data       Input RDD consisting of (vector of score probabilities, label), where label corresponds to the
+   *                   index of the true class and the score vector consists of probabilities for each class
+   * @param topNs      Sequence of topN values to calculate threshold metrics for.
+   *                   For example, if topN is Seq(1, 3, 10) then threshold metrics are calculated by considering if
+   *                   the score of the true class is in the top 1, top 3, and top 10 scores, respectively. If a topN
+   *                   value is greater than the number of total classes,
+   *                   then it will still be applied, but will have the same results as if that topN value = num classes
+   * @param thresholds Sequence of threshold values applied to predicted probabilities, therefore they must be in the
+   *                   range [0.0, 1.0]
+   */
+  def calculateThresholdMetrics(
+    data: RDD[(Array[Double], Double)],
+    topNs: Seq[Int],
+    thresholds: Seq[Double]
+  ): ThresholdMetrics = {
+    require(thresholds.nonEmpty, "thresholds sequence cannot be empty")
+    require(thresholds.forall(x => x >= 0 && x <= 1.0), "thresholds sequence elements must be in the range [0, 1]")
+    require(topNs.nonEmpty, "topN sequence cannot be empty")
+    require(topNs.forall(_ > 0), "topN sequence can only contain positive integers")
+
+    type Label = Int
+    type CorrIncorr = (Array[Long], Array[Long])
+    type MetricsMap = Map[Label, CorrIncorr]
+
+    val nThresholds = thresholds.length
+
+    /**
+     * Allocates an array of longs and fills it with a specified value from start until end
+     */
+    def arrayFill(size: Int)(start: Int, end: Int, value: Long) = {
+      val res = new Array[Long](size)
+      var i = start
+      while (i < end) {
+        res(i) = value
+        i += 1
+      }
+      res
+    }
+
+    /**
+     * First aggregation step turns an array of scores (as probabilities) and a single label (index of correct class)
+     * into two arrays, correct and incorrect counts by threshold. Each array index corresponds to whether
+     * the score counts as correct or incorrect at the threshold corresponding to that index.
+     */
+    def computeMetrics(scoresAndLabels: (Array[Double], Double)): MetricsMap = {
+      val scores: Array[Double] = scoresAndLabels._1
+      val label: Label = scoresAndLabels._2.toInt
+      val trueClassScore: Double = scores(label)
+      val topNsAndScores: Map[Label, Array[(Double, Int)]] = topNs.map(t => t -> scores.zipWithIndex.sortBy(-_._1)
+        .take(t)).toMap
+      val topNScores: Map[Label, Array[Double]] = topNsAndScores.mapValues(_.map(_._1))
+      // Doesn't matter which key you use since the scores are sorted
+      val topScore: Double = topNScores.head._2.head
+      val topNIndices: Map[Label, Array[Int]] = topNsAndScores.mapValues(_.map(_._2))
+
+      // To calculate correct / incorrect counts per threshold, we just need to find the array index where the
+      // true label score and the top score are no longer >= threshold.
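      // Illustrative trace (editorial example, values made up, not from this diff): with
      // thresholds = Seq(0.0, 0.25, 0.5, 0.75) and trueClassScore = 0.6,
      // thresholds.indexWhere(_ > 0.6) returns 3, since 0.75 is the first threshold strictly
      // above the score; indexWhere returns -1 when no threshold exceeds the score, which is
      // mapped to nThresholds below. The prediction therefore counts as correct at threshold
      // indices 0..2 and stops counting as correct from index 3 on.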
+      val trueScoreCutoffIndex: Int = {
+        val idx = thresholds.indexWhere(_ > trueClassScore)
+        if (idx < 0) nThresholds else idx
+      }
+      val maxScoreCutoffIndex: Int = {
+        val idx = thresholds.indexWhere(_ > topScore)
+        if (idx < 0) nThresholds else idx
+      }
+      topNs.view.map { t =>
+        val correctCounts = if (topNIndices(t).contains(label)) {
+          arrayFill(nThresholds)(start = 0, end = trueScoreCutoffIndex, value = 1L)
+        } else new Array[Long](nThresholds)
+
+        val incorrectCounts = if (topNIndices(t).contains(label)) {
+          arrayFill(nThresholds)(start = trueScoreCutoffIndex, end = maxScoreCutoffIndex, value = 1L)
+        } else arrayFill(nThresholds)(start = 0, end = maxScoreCutoffIndex, value = 1L)
+
+        t -> (correctCounts, incorrectCounts)
+      }.toMap[Label, CorrIncorr]
+    }
+
+    val zeroValue: MetricsMap =
+      topNs
+        .map(_ -> (new Array[Long](nThresholds), new Array[Long](nThresholds)))
+        .toMap[Label, CorrIncorr]
+
+    val agg: MetricsMap =
+      data.treeAggregate[MetricsMap](zeroValue)(combOp = _ + _, seqOp = _ + computeMetrics(_))
+
+    val nRows = data.count()
+    ThresholdMetrics(
+      topNs = topNs,
+      thresholds = thresholds,
+      correctCounts = agg.mapValues { case (cor, _) => cor.toSeq },
+      incorrectCounts = agg.mapValues { case (_, incor) => incor.toSeq },
+      noPredictionCounts = agg.mapValues { case (cor, incor) =>
+        (Array.fill(nThresholds)(nRows) + cor.map(-_) + incor.map(-_)).toSeq
+      }
+    )
+  }
+
  final private[op] def getMultiEvaluatorMetric(metricName: ClassificationEvalMetric, dataset: Dataset[_]): Double = {
    new MulticlassClassificationEvaluator()
      .setLabelCol(getLabelCol)
@@ -101,6 +244,39 @@ private[op] class OpMultiClassificationEvaluator
  * @param Recall
  * @param F1
  * @param Error
+ * @param ThresholdMetrics
  */
-case class MultiClassificationMetrics(Precision: Double, Recall: Double, F1: Double, Error: Double)
-  extends EvaluationMetrics
+case class MultiClassificationMetrics
+(
+  Precision: Double,
+  Recall: Double,
+  F1: Double,
+  Error: Double,
+  ThresholdMetrics: ThresholdMetrics
+) extends EvaluationMetrics
+
+/**
+ * Threshold-based metrics for multiclass classification
+ *
+ * Whether a classification counts as correct, incorrect, or no prediction is defined in terms of the topN and
+ * score threshold as follows:
+ * Correct - score of the true label is in the top N scores AND the score of the true label is >= threshold
+ * Incorrect - score of top predicted label >= threshold AND
+ *   (true label NOT in top N predicted labels OR score of true label < threshold)
+ * No prediction - otherwise (score of top predicted label < threshold)
+ *
+ * @param topNs              list of topN values (used as keys for the count maps)
+ * @param thresholds         list of threshold values (corresponding to the thresholds at the indices
+ *                           of the arrays in the count maps)
+ * @param correctCounts      map from topN value to an array of counts of correct classifications at each threshold
+ * @param incorrectCounts    map from topN value to an array of counts of incorrect classifications at each threshold
+ * @param noPredictionCounts map from topN value to an array of counts of no prediction at each threshold
+ */
+case class ThresholdMetrics
+(
+  topNs: Seq[Int],
+  thresholds: Seq[Double],
+  correctCounts: Map[Int, Seq[Long]],
+  incorrectCounts: Map[Int, Seq[Long]],
+  noPredictionCounts: Map[Int, Seq[Long]]
+) extends EvaluationMetrics
diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpDecisionTreeClassificationModel.scala b/core/src/main/scala/com/salesforce/op/filters/AllFeatureInformation.scala
similarity index 61%
rename from
core/src/main/scala/org/apache/spark/ml/classification/OpDecisionTreeClassificationModel.scala rename to core/src/main/scala/com/salesforce/op/filters/AllFeatureInformation.scala index e039dbd37d..ddb7846a2e 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpDecisionTreeClassificationModel.scala +++ b/core/src/main/scala/com/salesforce/op/filters/AllFeatureInformation.scala @@ -29,25 +29,23 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package org.apache.spark.ml.classification +package com.salesforce.op.filters -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.tree.Node - -import scala.reflect.runtime.universe.TypeTag - -class OpDecisionTreeClassificationModel -( - rootNode: Node, - numFeatures: Int, - numClasses: Int, - uid: String = UID[OpDecisionTreeClassificationModel], - val operationName: String = "opDTC" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends DecisionTreeClassificationModel(uid = uid, rootNode = rootNode, numFeatures = numFeatures, - numClasses = numClasses) with OpClassifierModelBase +/** + * Contains all feature distribution summaries and null label-leakage correlations used to + * determine dropped features in [[RawFeatureFilter]]. + * + * @param responseSummaries response summaries + * @param responseDistributions response distributions + * @param predictorSummaries predictor summaries + * @param predictorDistributions predictor distributions + * @param correlationInfo null label-leakage correlation map + * 1st level keys correspond to response keys + * 2nd level keys correspond to predictor keys with values being null-label leakage corr. value + */ +private[op] case class AllFeatureInformation( + responseSummaries: Map[FeatureKey, Summary], + responseDistributions: Array[FeatureDistribution], + predictorSummaries: Map[FeatureKey, Summary], + predictorDistributions: Array[FeatureDistribution], + correlationInfo: Map[FeatureKey, Map[FeatureKey, Double]]) diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala new file mode 100644 index 0000000000..2c0ac16179 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.filters
+
+import com.salesforce.op.features.TransientFeature
+import com.salesforce.op.stages.impl.feature.{Inclusion, NumericBucketizer}
+import com.twitter.algebird.Semigroup
+import com.twitter.algebird.Monoid._
+import com.twitter.algebird.Operators._
+import org.apache.spark.mllib.feature.HashingTF
+
+/**
+ * Class containing summary information for a feature
+ *
+ * @param name name of the feature
+ * @param key map key associated with distribution (when the feature is a map)
+ * @param count total count of feature seen
+ * @param nulls number of empties seen in feature
+ * @param distribution binned counts of feature values (hashed for strings, evenly spaced bins for numerics)
+ * @param summaryInfo either min and max number of tokens for text data,
+ *                    or number of splits used for bins for numeric data
+ */
+case class FeatureDistribution
+(
+  name: String,
+  key: Option[String],
+  count: Long,
+  nulls: Long,
+  distribution: Array[Double],
+  summaryInfo: Array[Double]
+) {
+
+  /**
+   * Get feature key associated with this distribution
+   */
+  def featureKey: FeatureKey = (name, key)
+
+  /**
+   * Check that feature distributions belong to the same feature and key.
+   *
+   * @param fd distribution to compare to
+   */
+  def checkMatch(fd: FeatureDistribution): Unit =
+    assert(name == fd.name && key == fd.key, "Name and key must match to compare or combine FeatureDistribution")
+
+  /**
+   * Get fill rate of feature
+   *
+   * @return fraction of data that is non-empty
+   */
+  def fillRate(): Double = if (count == 0L) 0.0 else (count - nulls) / count.toDouble
+
+  /**
+   * Combine feature distributions
+   *
+   * @param fd other feature distribution (from the same feature)
+   * @return summed distribution information
+   */
+  def reduce(fd: FeatureDistribution): FeatureDistribution = {
+    checkMatch(fd)
+    val combinedDist = distribution + fd.distribution
+    // summary info can be empty or min max if hist is empty but should otherwise match so take the longest info
+    val combinedSummary = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo
+    FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist, combinedSummary)
+  }
+
+  /**
+   * Ratio of fill rates between the two distributions, symmetric with the larger value on top
+   *
+   * @param fd feature distribution to compare to
+   * @return ratio of fill rates
+   */
+  def relativeFillRatio(fd: FeatureDistribution): Double = {
+    checkMatch(fd)
+    val (thisFill, thatFill) = (fillRate(), fd.fillRate())
+    val (small, large) = if (thisFill < thatFill) (thisFill, thatFill) else (thatFill, thisFill)
+    if (small == 0.0) Double.PositiveInfinity else large / small
+  }
+
+  /**
+   * Absolute difference in empty rates
+   *
+   * @param fd feature distribution to compare to
+   * @return absolute difference of rates
+   */
+  def relativeFillRate(fd: FeatureDistribution): Double = {
+    checkMatch(fd)
+    math.abs(fillRate() - fd.fillRate())
+  }
+
+  /**
+   * Jensen-Shannon divergence from this distribution to the other distribution fed in
+   *
+   * @param fd other feature distribution
+   * @return the JS divergence
+   */
+  def jsDivergence(fd: FeatureDistribution): Double = {
+    checkMatch(fd)
+    val combinedCounts = distribution.zip(fd.distribution).filterNot{ case (a, b) => a == 0.0 && b == 0.0 }
+    val (thisCount, thatCount) = combinedCounts
+      .fold[(Double, Double)]( (0, 0)){ case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) }
+    val probs = combinedCounts.map{ case (a, b) => a / thisCount -> b / thatCount }
+    val meanProb = probs.map{ case (a, b) => (a + b) / 2}
+    def log2(x: Double) = math.log10(x) / math.log10(2.0)
+    def klDivergence(a: Double, b: Double) = if (a == 0.0) 0.0 else a * log2(a / b)
+    probs.zip(meanProb).map{ case ((a, b), m) => 0.5 * klDivergence(a, m) + 0.5 * klDivergence(b, m) }.sum
+  }
+
+  override def toString(): String = {
+    s"Name=$name, Key=$key, Count=$count, Nulls=$nulls, Histogram=${distribution.toList}, BinInfo=${summaryInfo.toList}"
+  }
+}
+
+private[op] object FeatureDistribution {
+
+  val MaxBins = 100000
+
+  implicit val semigroup: Semigroup[FeatureDistribution] = new Semigroup[FeatureDistribution] {
+    override def plus(l: FeatureDistribution, r: FeatureDistribution) = l.reduce(r)
+  }
+
+  /**
+   * Facilitates feature distribution retrieval from computed feature summaries
+   *
+   * @param featureKey feature key
+   * @param summary feature summary
+   * @param value optional processed sequence
+   * @param bins number of histogram bins
+   * @param hasher hashing method to use for text and categorical features
+   * @return feature distribution given the provided information
+   */
+  def apply(
+    featureKey: FeatureKey,
+    summary: Summary,
+    value: Option[ProcessedSeq],
+    bins: Int,
+    hasher: HashingTF
+  ): FeatureDistribution = {
+    val (nullCount, (summaryInfo, distribution)): (Int, (Array[Double], Array[Double])) =
+      value.map(seq => 0 -> histValues(seq, summary, bins, hasher))
+        .getOrElse(1 -> (Array(summary.min, summary.max) -> Array.fill(bins)(0.0)))
+
+    FeatureDistribution(
+      name = featureKey._1,
+      key = featureKey._2,
+      count = 1,
+      nulls = nullCount,
+      summaryInfo = summaryInfo,
+      distribution = distribution)
+  }
+
+  /**
+   * Function to put data into histogram of counts
+   * @param values values to bin
+   * @param sum summary info for feature (max and min)
+   * @param bins number of bins to produce
+   * @param hasher hashing function to use for text
+   * @return the bin information and the binned counts
+   */
+  // TODO avoid wrapping and unwrapping??
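To make the numeric binning in the method below concrete, here is a minimal editorial trace of the split computation with made-up values (not part of the diff):

    val (min, max, bins) = (0.0, 10.0, 12)
    val step = (max - min) / (bins - 2.0)                 // 1.0
    val splits = (0 until bins).map(b => min + step * b)  // 0.0, 1.0, ..., 11.0
    // The last split intentionally overshoots max: the extra bins catch edge and "other" values,
    // matching the code comment that the bin total includes one for edge and one for other.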
+ private def histValues( + values: ProcessedSeq, + sum: Summary, + bins: Int, + hasher: HashingTF + ): (Array[Double], Array[Double]) = { + values match { + case Left(seq) => Array(sum.min, sum.max) -> hasher.transform(seq).toArray // TODO use summary info to pick hashes + case Right(seq) => // TODO use kernel fit instead of histogram + if (sum == Summary.empty) { + Array(sum.min, sum.max) -> seq.toArray // the seq will always be empty in this case + } else if (sum.min < sum.max) { + val step = (sum.max - sum.min) / (bins - 2.0) // total number of bins includes one for edge and one for other + val splits = (0 until bins).map(b => sum.min + step * b).toArray + val binned = seq.map { v => + NumericBucketizer.bucketize( + splits = splits, trackNulls = false, trackInvalid = true, + splitInclusion = Inclusion.Left, input = Option(v) + ).toArray + } + val hist = binned.fold(new Array[Double](bins))(_ + _) + splits -> hist + } else { + val same = seq.map(v => if (v == sum.max) 1.0 else 0.0).sum + val other = seq.map(v => if (v != sum.max) 1.0 else 0.0).sum + Array(sum.min, sum.max) -> Array(same, other) + } + } + } +} diff --git a/core/src/main/scala/com/salesforce/op/filters/PreparedFeatures.scala b/core/src/main/scala/com/salesforce/op/filters/PreparedFeatures.scala new file mode 100644 index 0000000000..412a947f38 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/PreparedFeatures.scala @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+package com.salesforce.op.filters
+
+import com.salesforce.op.features.TransientFeature
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.impl.feature.TextTokenizer
+import com.salesforce.op.utils.spark.RichRow._
+import com.salesforce.op.utils.text.Language
+import org.apache.spark.mllib.feature.HashingTF
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
+import org.apache.spark.sql.Row
+
+/**
+ * Class representing processed responses and predictors keyed by their respective feature key
+ *
+ * @param responses prepared responses
+ * @param predictors prepared predictors
+ */
+private[filters] case class PreparedFeatures(
+  responses: Map[FeatureKey, ProcessedSeq],
+  predictors: Map[FeatureKey, ProcessedSeq]) {
+
+  /**
+   * Computes summaries keyed by feature keys for this observation.
+   *
+   * @return pair consisting of response and predictor summaries (in this order)
+   */
+  def summaries: (Map[FeatureKey, Summary], Map[FeatureKey, Summary]) =
+    responses.mapValues(Summary(_)) -> predictors.mapValues(Summary(_))
+
+  /**
+   * Computes vector of size responseKeys.length + predictorKeys.length. The first responseKeys.length
+   * values are the actual response values (nulls replaced with 0.0). Its (i + responses.length)th value
+   * is 1 iff the predictor associated with the ith feature key is null, for i >= 0.
+   *
+   * @param responseKeys response feature keys
+   * @param predictorKeys set of all predictor keys needed for constructing binary vector
+   * @return null label-leakage correlation vector
+   */
+  def getNullLabelLeakageVector(responseKeys: Array[FeatureKey], predictorKeys: Array[FeatureKey]): Vector = {
+    val responseValues = responseKeys.map(responses.get(_).collect {
+      case Right(Seq(d)) => d
+    }.getOrElse(0.0))
+    val predictorNullIndicatorValues = predictorKeys.map(predictors.get(_).map(_ => 0.0).getOrElse(1.0))
+
+    Vectors.dense(responseValues ++ predictorNullIndicatorValues)
+  }
+
+  /*
+   * Generates a pair of feature distribution arrays. The first element is associated with responses,
+   * and the second with predictors.
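   // Editorial illustration of getNullLabelLeakageVector (made-up values, not from this diff):
   // with responseKeys = Array(("label", None)) and predictorKeys = Array(("age", None), ("zip", None)),
   // a row where the label is 1.0, "age" is present, and "zip" is missing yields
   // Vectors.dense(1.0, 0.0, 1.0): the response value first, then one 0/1 null indicator per
   // predictor. Correlating these columns downstream surfaces predictors whose missingness
   // leaks the label.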
+   *
+   * @param responseSummaries response feature summary statistics, keyed by feature key
+   * @param predictorSummaries predictor feature summary statistics, keyed by feature key
+   * @param bins number of bins to put numerics into
+   * @param hasher hash function to use on strings
+   * @return a pair consisting of response and predictor feature distributions (in this order)
+   */
+  def getFeatureDistributions(
+    responseSummaries: Array[(FeatureKey, Summary)],
+    predictorSummaries: Array[(FeatureKey, Summary)],
+    bins: Int,
+    hasher: HashingTF
+  ): (Array[FeatureDistribution], Array[FeatureDistribution]) = {
+    val responseFeatureDistributions: Array[FeatureDistribution] =
+      getFeatureDistributions(responses, responseSummaries, bins, hasher)
+    val predictorFeatureDistributions: Array[FeatureDistribution] =
+      getFeatureDistributions(predictors, predictorSummaries, bins, hasher)
+
+    responseFeatureDistributions -> predictorFeatureDistributions
+  }
+
+  private def getFeatureDistributions(
+    features: Map[FeatureKey, ProcessedSeq],
+    summaries: Array[(FeatureKey, Summary)],
+    bins: Int,
+    hasher: HashingTF
+  ): Array[FeatureDistribution] = summaries.map { case (featureKey, summary) =>
+    FeatureDistribution(
+      featureKey = featureKey,
+      summary = summary,
+      value = features.get(featureKey),
+      bins = bins,
+      hasher = hasher)
+  }
+}
+
+private[filters] object PreparedFeatures {
+
+  /**
+   * Retrieve prepared features from a given data frame row and partition the transient features
+   * into responses and predictors.
+   *
+   * @param row data frame row
+   * @param responses transient features derived from responses
+   * @param predictors transient features derived from predictors
+   * @return set of prepared features
+   */
+  def apply(row: Row, responses: Array[TransientFeature], predictors: Array[TransientFeature]): PreparedFeatures = {
+    val empty: Map[FeatureKey, ProcessedSeq] = Map()
+    val preparedResponses = responses.foldLeft(empty) { case (map, feature) =>
+      val converter = FeatureTypeSparkConverter.fromFeatureTypeName(feature.typeName)
+      map ++ prepareFeature(feature.name, row.getFeatureType(feature)(converter))
+    }
+    val preparedPredictors = predictors.foldLeft(empty) { case (map, feature) =>
+      val converter = FeatureTypeSparkConverter.fromFeatureTypeName(feature.typeName)
+      map ++ prepareFeature(feature.name, row.getFeatureType(feature)(converter))
+    }
+
+    PreparedFeatures(responses = preparedResponses, predictors = preparedPredictors)
+  }
+
+  /**
+   * Turn features into a sequence that will have stats computed on it based on the type of the feature
+   *
+   * @param name feature name
+   * @param value feature value
+   * @tparam T type of the feature
+   * @return map from feature key to a processed sequence of either strings or doubles (empty when the feature is empty)
+   */
+  private def prepareFeature[T <: FeatureType](name: String, value: T): Map[FeatureKey, ProcessedSeq] =
+    value match {
+      case v: Text => v.value
+        .map(s => Map[FeatureKey, ProcessedSeq]((name, None) -> Left(tokenize(s))))
+        .getOrElse(Map())
+      case v: OPNumeric[_] => v.toDouble
+        .map(d => Map[FeatureKey, ProcessedSeq]((name, None) -> Right(Seq(d))))
+        .getOrElse(Map())
+      case ft@SomeValue(v: DenseVector) => Map((name, None) -> Right(v.toArray.toSeq))
+      case ft@SomeValue(v: SparseVector) => Map((name, None) -> Right(v.indices.map(_.toDouble).toSeq))
+      case ft@SomeValue(_) => ft match {
+        case v: Geolocation => Map((name, None) -> Right(v.value))
+        case v: TextList => Map((name, None) -> Left(v.value))
+        case v: DateList => Map((name, None) -> Right(v.value.map(_.toDouble)))
+        case v:
MultiPickList => Map((name, None) -> Left(v.value.toSeq)) + case v: MultiPickListMap => v.value.map { case (k, e) => (name, Option(k)) -> Left(e.toSeq) } + case v: GeolocationMap => v.value.map{ case (k, e) => (name, Option(k)) -> Right(e) } + case v: OPMap[_] => v.value.map { case (k, e) => e match { + case d: Double => (name, Option(k)) -> Right(Seq(d)) + // Do not need to distinguish between string map types, all text is tokenized for distribution calculation + case s: String => (name, Option(k)) -> Left(tokenize(s)) + case l: Long => (name, Option(k)) -> Right(Seq(l.toDouble)) + case b: Boolean => (name, Option(k)) -> Right(Seq(if (b) 1.0 else 0.0)) + }} + case _ => throw new RuntimeException(s"Feature type $value is not supported in RawFeatureFilter") + } + case _ => Map() + } + + /** + * Tokenizes an input string. + * + * @param s input string + * @return array of string tokens + */ + private def tokenize(s: String) = TextTokenizer.Analyzer.analyze(s, Language.Unknown) +} diff --git a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala index 42ff002dc8..34c9bff908 100644 --- a/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala +++ b/core/src/main/scala/com/salesforce/op/filters/RawFeatureFilter.scala @@ -31,19 +31,23 @@ package com.salesforce.op.filters +import scala.math.{abs, min} + import com.salesforce.op.OpParams import com.salesforce.op.features.types._ import com.salesforce.op.features.{OPFeature, TransientFeature} -import com.salesforce.op.filters.FeatureDistrib.ProcessedSeq import com.salesforce.op.readers.{DataFrameFieldNames, Reader} import com.salesforce.op.stages.impl.feature.{HashAlgorithm, Inclusion, NumericBucketizer, TextTokenizer} +import com.salesforce.op.stages.impl.preparators.CorrelationType import com.salesforce.op.utils.spark.RichRow._ -import com.salesforce.op.utils.text.Language import com.twitter.algebird.Monoid import com.twitter.algebird.Semigroup import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ import org.apache.spark.mllib.feature.HashingTF +import org.apache.spark.mllib.linalg.{Matrix, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.slf4j.LoggerFactory @@ -59,6 +63,9 @@ import org.slf4j.LoggerFactory * @param maxFillDifference maximum acceptable fill rate difference between training and scoring data to be kept * @param maxFillRatioDiff maximum acceptable fill ratio between training and scoring (larger / smaller) * @param maxJSDivergence maximum Jensen-Shannon divergence between training and scoring distributions to be kept + * @param maxCorrelation maximum absolute correlation allowed between raw predictor null indicator and label + * @param correlationType type of correlation metric to use + * @param jsDivergenceProtectedFeatures features that are protected from removal by JS divergence check * @param protectedFeatures features that are protected from removal * @tparam T datatype of the reader */ @@ -71,11 +78,14 @@ class RawFeatureFilter[T] val maxFillDifference: Double, val maxFillRatioDiff: Double, val maxJSDivergence: Double, + val maxCorrelation: Double, + val correlationType: CorrelationType = CorrelationType.Pearson, + val jsDivergenceProtectedFeatures: Set[String] = Set.empty, val protectedFeatures: Set[String] = Set.empty ) extends Serializable { - 
assert(bins > 1 && bins <= FeatureDistrib.MaxBins, s"Invalid bin size $bins," + - s" bins must be between 1 and ${FeatureDistrib.MaxBins}") + assert(bins > 1 && bins <= FeatureDistribution.MaxBins, s"Invalid bin size $bins," + + s" bins must be between 1 and ${FeatureDistribution.MaxBins}") assert(minFill >= 0.0 && minFill <= 1.0, s"Invalid minFill size $minFill, minFill must be between 0 and 1") assert(maxFillDifference >= 0.0 && maxFillDifference <= 1.0, s"Invalid maxFillDifference size $maxFillDifference," + s" maxFillDifference must be between 0 and 1") @@ -90,80 +100,69 @@ class RawFeatureFilter[T] .setBinary(false) .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) - private def tokenize(s: String) = TextTokenizer.Analyzer.analyze(s, Language.Unknown) - - /** - * Turn features into a sequence that will have stats computed on it based on the type of the feature - * @param value feature value - * @tparam T type of the feature - * @return a tuple containing whether the feature was empty and a sequence of either doubles or strings - */ - private def prepareFeatures[T <: FeatureType](value: T): (Boolean, ProcessedSeq) = { - value match { - case v: Text => v.isEmpty -> Left(v.value.map(tokenize).getOrElse(Seq.empty)) // TODO are empty strings == nulls - case v: OPNumeric[_] => v.isEmpty -> Right(v.toDouble.toSeq) - case v: OPVector => v.isEmpty -> Right(v.value.toArray.toSeq) - case v: Geolocation => v.isEmpty -> Right(v.value) - case v: TextList => v.isEmpty -> Left(v.value) - case v: DateList => v.isEmpty -> Right(v.value.map(_.toDouble)) - case v: MultiPickList => v.isEmpty -> Left(v.value.toSeq) - case _ => throw new RuntimeException(s"Feature type $value is not supported in RawFeatureFilter") - } - } - - - /** - * Turn map features into a map of sequences that will have stats computed on it based on the type of the feature - * @param value feature value - * @tparam T type of the map feature - * @return a map from the keys to a sequence of either doubles or strings - */ - private def prepareMapFeatures[T <: FeatureType](value: T): Map[String, ProcessedSeq] = { - value match { - case v: MultiPickListMap => v.value.map{ case (k, e) => k -> Left(e.toSeq) } - case v: GeolocationMap => v.value.map{ case (k, e) => k -> Right(e) } - case v: OPMap[_] => v.value.map { case (k, e) => e match { - case d: Double => k -> Right(Seq(d)) - case s: String => k -> Left(tokenize(s)) - case l: Long => k -> Right(Seq(l.toDouble)) - case b: Boolean => k -> Right(Seq(if (b) 1.0 else 0.0)) - }} - case _ => throw new RuntimeException(s"Feature type $value is not supported in RawFeatureFilter") - } - } /** * Get binned counts of the feature distribution and empty count for each raw feature * @param data data frame to compute counts on - * @param rawFeatures list of raw features contained in the dataframe + * @param features list of raw, non-protected, features contained in the dataframe + * @param allFeatureInfo existing feature info to use * @return a sequence of distribution summaries for each raw feature */ - // TODO do these computations on a per label basis?? 
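The replacement hunk below computes null-label leakage correlations by correlating response values against predictor null indicators via Statistics.corr. As a minimal standalone sketch of that step (toy data, Pearson, assuming a SparkContext named sc; not part of this diff):

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.stat.Statistics

    // Each row: (response value, predictor-1 null indicator, predictor-2 null indicator)
    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(1.0, 1.0, 1.0)
    ))
    val corrMatrix = Statistics.corr(rows, "pearson")
    // corrMatrix(0, j) is the correlation between the response and predictor j's null indicator,
    // which is the quantity compared against maxCorrelation when deciding what to drop.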
-  private[op] def computeFeatureStats(data: DataFrame, rawFeatures: Array[OPFeature],
-    featureSummaries: Option[AllFeatureInformation] = None): AllFeatureInformation = {
-    val (mapTranFeatures, tranFeatures) = rawFeatures
-      .map(f => TransientFeature(f) -> FeatureTypeSparkConverter()(f.wtt))
-      .partition(_._1.getFeature().isSubtypeOf[OPMap[_]])
+  private[op] def computeFeatureStats(
+    data: DataFrame,
+    features: Array[OPFeature],
+    allFeatureInfo: Option[AllFeatureInformation] = None): AllFeatureInformation = {
+    val (responses, predictors): (Array[TransientFeature], Array[TransientFeature]) = {
+      val (allResponses, allPredictors) = features.partition(_.isResponse)
+      val respOut = allResponses.map(TransientFeature(_)).flatMap {
+        case f if f.getFeature().isSubtypeOf[OPNumeric[_]] =>
+          log.info("Using numeric response: {}", f.name)
+          Option(f)
+        case f =>
+          log.info("Not using non-numeric response in raw feature filter: {}", f.name)
+          None
+      }
+      val predOut = allPredictors.map(TransientFeature(_))
-    val preparedFeatures = data.rdd.map{ row =>
-      tranFeatures.map(f => prepareFeatures(row.getFeatureType(f._1)(f._2))) ->
-        mapTranFeatures.map(mf => prepareMapFeatures(row.getFeatureType(mf._1)(mf._2)))
+      (respOut, predOut)
     }
-
-    val (summaryFeatures, summaryMapFeatures) = // Have to use the training summaries do process scoring for comparison
-      featureSummaries.map{ fs => fs.featureSummaries -> fs.mapFeatureSummaries }.getOrElse{
-        preparedFeatures.map { case (features, mapFeatures) =>
-          features.map(f => Summary(f._2)) -> mapFeatures.map(mf => mf.map { case (k, v) => k -> Summary(v) })
-        }.reduce(_ + _)
+    val preparedFeatures: RDD[PreparedFeatures] =
+      data.rdd.map(PreparedFeatures(_, responses, predictors))
+    // Have to use the training summaries to process scoring for comparison
+    val (responseSummaries, predictorSummaries): (Map[FeatureKey, Summary], Map[FeatureKey, Summary]) =
+      allFeatureInfo.map(info => info.responseSummaries -> info.predictorSummaries)
+        .getOrElse(preparedFeatures.map(_.summaries).reduce(_ + _))
+    val (responseSummariesArr, predictorSummariesArr): (Array[(FeatureKey, Summary)], Array[(FeatureKey, Summary)]) =
+      (responseSummaries.toArray, predictorSummaries.toArray)
+    val (responseDistributions, predictorDistributions): (Array[FeatureDistribution], Array[FeatureDistribution]) =
+      preparedFeatures
+        .map(_.getFeatureDistributions(
+          responseSummaries = responseSummariesArr,
+          predictorSummaries = predictorSummariesArr,
+          bins = bins,
+          hasher = hasher))
+        .reduce(_ + _) // NOTE: resolved semigroup is IndexedSeqSemigroup
+    val correlationInfo: Map[FeatureKey, Map[FeatureKey, Double]] =
+      allFeatureInfo.map(_.correlationInfo).getOrElse {
+        val emptyCorr: Map[FeatureKey, Map[FeatureKey, Double]] = Map()
+        val responseKeys: Array[FeatureKey] = responseSummariesArr.map(_._1)
+        val predictorKeys: Array[FeatureKey] = predictorSummariesArr.map(_._1)
+        val corrRDD: RDD[Vector] = preparedFeatures.map(_.getNullLabelLeakageVector(responseKeys, predictorKeys))
+        val corrMatrix: Matrix = Statistics.corr(corrRDD, correlationType.sparkName)
+
+        responseKeys.zipWithIndex.map { case (responseKey, i) =>
+          responseKey -> predictorKeys.zipWithIndex.map { case (predictorKey, j) =>
+            predictorKey -> min(abs(corrMatrix(i, j + responseKeys.length)), 1.0)
+          }.toMap
+        }.toMap
+      }
-    val featureDistrib = preparedFeatures
-      .map{ case (features, mapFeatures) =>
-        FeatureDistrib.getDistributions(tranFeatures.map(_._1), features, summaryFeatures, bins, hasher) ++
-
FeatureDistrib.getMapDistributions(mapTranFeatures.map(_._1), mapFeatures, summaryMapFeatures, bins, hasher) } - .reduce(_ + _) - - AllFeatureInformation(summaryFeatures, summaryMapFeatures, featureDistrib) + AllFeatureInformation( + responseSummaries = responseSummaries, + responseDistributions = responseDistributions, + predictorSummaries = predictorSummaries, + predictorDistributions = predictorDistributions, + correlationInfo = correlationInfo) } /** @@ -171,12 +170,14 @@ class RawFeatureFilter[T] * features should be dropped (including maps with all keys dropped) and which map keys need to be dropped * @param trainingDistribs summary of distributions for training data features * @param scoringDistribs summary of distributions for scoring data features (may be an empty seq) + * @param correlationInfo info needed to determine feature to drop based on null label-leakage correlation * @return a list of feature names that should be dropped and a map of map keys that should be dropped * Map(featureName -> key) */ private[op] def getFeaturesToExclude( - trainingDistribs: Seq[FeatureDistrib], - scoringDistribs: Seq[FeatureDistrib] + trainingDistribs: Seq[FeatureDistribution], + scoringDistribs: Seq[FeatureDistribution], + correlationInfo: Map[FeatureKey, Map[FeatureKey, Double]] ): (Seq[String], Map[String, Set[String]]) = { def logExcluded(excluded: Seq[Boolean], message: String): Unit = { @@ -185,14 +186,29 @@ class RawFeatureFilter[T] log.info(s"$message: ${featuresDropped.mkString(", ")}") } - val featureSize = trainingDistribs.size + val featureSize = trainingDistribs.length val trainingUnfilled = trainingDistribs.map(_.fillRate() < minFill) logExcluded(trainingUnfilled, s"Features excluded because training fill rate did not meet min required ($minFill)") + val trainingNullLabelLeakers = { + if (correlationInfo.isEmpty) Seq.fill(featureSize)(false) + else { + val absoluteCorrs = correlationInfo.map(_._2) + for {distrib <- trainingDistribs} yield { + // Only filter if feature absolute null-label leakage correlation is greater than allowed correlation + val nullLabelLeakerIndicators = absoluteCorrs.map(_.get(distrib.featureKey).exists(_ > maxCorrelation)) + nullLabelLeakerIndicators.exists(identity(_)) + } + } + } + logExcluded( + trainingNullLabelLeakers, + s"Features excluded because null indicator correlation (absolute) exceeded max allowed ($maxCorrelation)") + val scoringUnfilled = if (scoringDistribs.nonEmpty) { - assert(scoringDistribs.length == trainingDistribs.length, "scoring and training features must match") + assert(scoringDistribs.length == featureSize, "scoring and training features must match") val su = scoringDistribs.map(_.fillRate() < minFill) logExcluded(su, s"Features excluded because scoring fill rate did not meet min required ($minFill)") su @@ -206,7 +222,9 @@ class RawFeatureFilter[T] log.info(combined.map { case (t, s) => s"\n$t\n$s\nTrain Fill=${t.fillRate()}, Score Fill=${s.fillRate()}, " + s"JS Divergence=${t.jsDivergence(s)}, Fill Rate Difference=${t.relativeFillRate(s)}, " + s"Fill Ratio Difference=${t.relativeFillRatio(s)}" }.mkString("\n")) - val kl = combined.map { case (t, s) => t.jsDivergence(s) > maxJSDivergence } + val kl = combined.map { case (t, s) => + !jsDivergenceProtectedFeatures.contains(t.name) && t.jsDivergence(s) > maxJSDivergence + } logExcluded(kl, s"Features excluded because JS Divergence exceeded max allowed ($maxJSDivergence)") val mf = combined.map { case (t, s) => t.relativeFillRate(s) > maxFillDifference } logExcluded(mf, s"Features 
excluded because fill rate difference exceeded max allowed ($maxFillDifference)") @@ -217,8 +235,8 @@ class RawFeatureFilter[T] Seq.fill(featureSize)(false) } - val allExcludeReasons = trainingUnfilled.zip(scoringUnfilled).zip(distribMismatches) - .map{ case ((t, s), d) => t || s || d } + val allExcludeReasons = trainingUnfilled.zip(scoringUnfilled).zip(distribMismatches).zip(trainingNullLabelLeakers) + .map{ case (((t, s), d), n) => t || s || d || n } val (toDrop, toKeep) = trainingDistribs.zip(allExcludeReasons).partition(_._2) @@ -243,14 +261,12 @@ class RawFeatureFilter[T] def generateFilteredRaw(rawFeatures: Array[OPFeature], parameters: OpParams) (implicit spark: SparkSession): (DataFrame, Array[OPFeature]) = { - val (_, predictorFeatures) = rawFeatures.partition(f => f.isResponse || protectedFeatures.contains(f.name) ) - val trainData = trainingReader.generateDataFrame(rawFeatures, parameters).persist() log.info("Loaded training data") assert(trainData.count() > 0, "RawFeatureFilter cannot work with empty training data") - val trainingSummary = computeFeatureStats(trainData, predictorFeatures) // TODO also response summaries?? + val trainingSummary = computeFeatureStats(trainData, rawFeatures) // TODO also response summaries?? log.info("Computed summary stats for training features") - log.debug(trainingSummary.featureDistributions.mkString("\n")) + log.debug(trainingSummary.predictorDistributions.mkString("\n")) val scoreData = scoreReader.flatMap{ s => val sd = s.generateDataFrame(rawFeatures, parameters.switchReaderParams()).persist() @@ -263,16 +279,16 @@ class RawFeatureFilter[T] } val scoringSummary = scoreData.map{ sd => - val ss = computeFeatureStats(sd, predictorFeatures, Some(trainingSummary)) // TODO also response summaries?? + val ss = computeFeatureStats(sd, rawFeatures, Some(trainingSummary)) // TODO also response summaries?? 
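As a usage sketch of the expanded constructor (the reader instances and all threshold values here are illustrative assumptions, not taken from this diff):

    val filter = new RawFeatureFilter(
      trainingReader = trainReader,               // assumed Reader[T] for training data
      scoreReader = Some(scoreReader),            // assumed optional Reader[T] for scoring data
      bins = 100,
      minFill = 0.05,
      maxFillDifference = 0.10,
      maxFillRatioDiff = 10.0,
      maxJSDivergence = 0.90,
      maxCorrelation = 0.95,                      // new: cap on null-indicator vs label correlation
      correlationType = CorrelationType.Pearson,
      jsDivergenceProtectedFeatures = Set("someTextFeature"), // hypothetical feature names
      protectedFeatures = Set("mustKeepFeature")
    )

Predictors whose null indicator correlates with the label above maxCorrelation are then dropped alongside the existing fill-rate and divergence checks.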
log.info("Computed summary stats for scoring features") - log.debug(ss.featureDistributions.mkString("\n")) + log.debug(ss.predictorDistributions.mkString("\n")) ss } val (featuresToDropNames, mapKeysToDrop) = getFeaturesToExclude( - trainingSummary.featureDistributions, - scoringSummary.toSeq.flatMap(_.featureDistributions) - ) + trainingSummary.predictorDistributions.filterNot(d => protectedFeatures.contains(d.name)), + scoringSummary.toSeq.flatMap(_.predictorDistributions.filterNot(d => protectedFeatures.contains(d.name))), + trainingSummary.correlationInfo) val (featuresToDrop, featuresToKeep) = rawFeatures.partition(rf => featuresToDropNames.contains(rf.name)) val featuresToKeepNames = Array(DataFrameFieldNames.KeyFieldName) ++ featuresToKeep.map(_.name) @@ -297,227 +313,3 @@ class RawFeatureFilter[T] cleanedData -> featuresToDrop } } - -private[op] case class AllFeatureInformation -( - featureSummaries: Array[Summary], - mapFeatureSummaries: Array[Map[String, Summary]], - featureDistributions: Array[FeatureDistrib] -) - -/** - * Class used to get summaries of prepped features so know how to bin it for distributions - * @param min minimum value seen - * @param max maximum value seen - */ -private[op] case class Summary(min: Double, max: Double) - -private[op] case object Summary { - - val empty: Summary = Summary(Double.PositiveInfinity, Double.NegativeInfinity) - - implicit val monoid: Monoid[Summary] = new Monoid[Summary] { - override def zero = empty - override def plus(l: Summary, r: Summary) = Summary(math.min(l.min, r.min), math.max(l.max, r.max)) - } - - def apply(preppedFeature: ProcessedSeq): Summary = { - preppedFeature match { - case Left(v) => Summary(v.size, v.size) - case Right(v) => monoid.sum(v.map(d => Summary(d, d))) - } - } -} - - -/** - * Class containing summary information for a feature - * @param name name of the feature - * @param key map key associated with distribution (when the feature is a map) - * @param count total count of feature seen - * @param nulls number of empties seen in feature - * @param distribution binned counts of feature values (hashed for strings, evently spaced bins for numerics) - * @param summaryInfo either min and max of data (for text data) or splits used for bins for numeric data - */ -case class FeatureDistrib -( - name: String, - key: Option[String], - count: Long, - nulls: Long, - distribution: Array[Double], - summaryInfo: Array[Double] -) { - - /** - * Check that feature distributions below to the same feature and key - * @param fd distribution to compare to - */ - def checkMatch(fd: FeatureDistrib): Unit = - assert(name == fd.name && key == fd.key, "Name and key must match to compare or combine FeatureDistrib") - - /** - * Get fill rate of feature - * @return fraction of data that is non empty - */ - def fillRate(): Double = if (count == 0L) 0.0 else (count - nulls) / count.toDouble - - /** - * Combine feature distributions - * @param fd other feature distribution (from the same feature) - * @return summed distribution information - */ - def reduce(fd: FeatureDistrib): FeatureDistrib = { - checkMatch(fd) - val combinedDist = distribution + fd.distribution - // summary info can be empty or min max if hist is empty but should otherwise match so take the longest info - val combinedSummary = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo - FeatureDistrib(name, key, count + fd.count, nulls + fd.nulls, combinedDist, combinedSummary) - } - - /** - * Ratio of fill rates between the two distributions 
symetric with larger value on the top - * @param fd feature distribution to compare to - * @return ratio of fill rates - */ - def relativeFillRatio(fd: FeatureDistrib): Double = { - checkMatch(fd) - val (thisFill, thatFill) = (fillRate(), fd.fillRate()) - val (small, large) = if (thisFill < thatFill) (thisFill, thatFill) else (thatFill, thisFill) - if (small == 0.0) Double.PositiveInfinity else large / small - } - - /** - * Absolute difference in empty rates - * @param fd feature distribution to compare to - * @return fill rate ratio with larger fill rate on the bottom - */ - def relativeFillRate(fd: FeatureDistrib): Double = { - checkMatch(fd) - math.abs(fillRate() - fd.fillRate()) - } - - /** - * Jensen-Shannon divergence from this distribution to the other distribution fed in - * @param fd other feature distribution - * @return the KL divergence - */ - def jsDivergence(fd: FeatureDistrib): Double = { - checkMatch(fd) - val combinedCounts = distribution.zip(fd.distribution).filterNot{ case (a, b) => a == 0.0 && b == 0.0 } - val (thisCount, thatCount) = combinedCounts - .fold[(Double, Double)]( (0, 0)){ case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) } - val probs = combinedCounts.map{ case (a, b) => a / thisCount -> b / thatCount } - val meanProb = probs.map{ case (a, b) => (a + b) / 2} - def log2(x: Double) = math.log10(x) / math.log10(2.0) - def klDivergence(a: Double, b: Double) = if (a == 0.0) 0.0 else a * log2(a / b) - probs.zip(meanProb).map{ case ((a, b), m) => 0.5 * klDivergence(a, m) + 0.5 * klDivergence(b, m) }.sum - } - - override def toString(): String = { - s"Name=$name, Key=$key, Count=$count, Nulls=$nulls, Histogram=${distribution.toList}, BinInfo=${summaryInfo.toList}" - } -} - -private[op] case object FeatureDistrib { - - type ProcessedSeq = Either[Seq[String], Seq[Double]] - - val MaxBins = 100000 - - implicit val semigroup: Semigroup[FeatureDistrib] = new Semigroup[FeatureDistrib] { - override def plus(l: FeatureDistrib, r: FeatureDistrib) = l.reduce(r) - } - - /** - * Function to put data into histogram of counts - * @param values values to bin - * @param sum summary info for feature (max and min) - * @param bins number of bins to produce - * @param hasher hasing function to use for text - * @return the bin information and the binned counts - */ - // TODO avoid wrapping and unwrapping?? 
- private def histValues( - values: ProcessedSeq, - sum: Summary, - bins: Int, - hasher: HashingTF - ): (Array[Double], Array[Double]) = { - values match { - case Left(seq) => Array(sum.min, sum.max) -> hasher.transform(seq).toArray // TODO use summary info to pick hashes - case Right(seq) => // TODO use kernel fit instead of histogram - if (sum == Summary.empty) { - Array(sum.min, sum.max) -> seq.toArray // the seq will always be empty in this case - } else if (sum.min < sum.max) { - val step = (sum.max - sum.min) / (bins - 2.0) // total number of bins includes one for edge and one for other - val splits = (0 until bins).map(b => sum.min + step * b).toArray - val binned = seq.map { v => - NumericBucketizer.bucketize( - splits = splits, trackNulls = false, trackInvalid = true, - splitInclusion = Inclusion.Left, input = Option(v) - ).toArray - } - val hist = binned.fold(new Array[Double](bins))(_ + _) - splits -> hist - } else { - val same = seq.map(v => if (v == sum.max) 1.0 else 0.0).sum - val other = seq.map(v => if (v != sum.max) 1.0 else 0.0).sum - Array(sum.min, sum.max) -> Array(same, other) - } - } - } - - /** - * Create the distributions for regular features - * @param features list of transient features - * @param values values of the features processed into a sequence of either doubles or strings with boolean - * indicating if original feature was empty - * @param summary summary statistics about feature - * @param bins number of bins to put numerics into - * @param hasher hash function to use on strings - * @return feature distribution for single feature value to be aggregated - */ - def getDistributions( - features: Array[TransientFeature], - values: Array[(Boolean, ProcessedSeq)], - summary: Array[Summary], - bins: Int, - hasher: HashingTF - ): Array[FeatureDistrib] = { - features.zip(values).zip(summary).map{ - case ((tf, (isNull, seq)), sum) => - val isNullCount = if (isNull) 1 else 0 - val (info, histogram) = histValues(seq, sum, bins, hasher) - FeatureDistrib(tf.name, None, 1, isNullCount, histogram, info) - } - } - - /** - * Create the distributions for map features - * @param features list of transient map features - * @param values values of the features processed into a map from key to sequence of either doubles or strings - * @param summary map from key to summary statistics about feature - * @param bins number of bins to put numerics into - * @param hasher hash function to use on strings - * @return feature distribution for single feature and key value to be aggregated - */ - def getMapDistributions( - features: Array[TransientFeature], - values: Array[Map[String, ProcessedSeq]], - summary: Array[Map[String, Summary]], - bins: Int, - hasher: HashingTF - ): Array[FeatureDistrib] = { - features.zip(values).zip(summary).flatMap { - case ((tf, map), sum) => sum.map { case (key, seq) => - val isNullCount = if (map.contains(key)) 0 else 1 - val (info, histogram) = map.get(key) - .map(seq => histValues(seq, sum(key), bins, hasher)) - .getOrElse(Array(sum(key).min, sum(key).max), Array.fill(bins)(0.0)) - FeatureDistrib(tf.name, Some(key), 1, isNullCount, histogram, info) - } - } - } - -} diff --git a/core/src/main/scala/com/salesforce/op/filters/Summary.scala b/core/src/main/scala/com/salesforce/op/filters/Summary.scala new file mode 100644 index 0000000000..810f642b3d --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/Summary.scala @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.filters + +import com.twitter.algebird.Monoid + +/** + * Class used to get summaries of prepared features to determine distribution binning strategy + * + * @param min minimum value seen + * @param max maximum value seen + */ +private[op] case class Summary(min: Double, max: Double) + +private[op] case object Summary { + + val empty: Summary = Summary(Double.PositiveInfinity, Double.NegativeInfinity) + + implicit val monoid: Monoid[Summary] = new Monoid[Summary] { + override def zero = empty + override def plus(l: Summary, r: Summary) = Summary(math.min(l.min, r.min), math.max(l.max, r.max)) + } + + /** + * @param preppedFeature processed feature + * @return feature summary derived from processed feature + */ + def apply(preppedFeature: ProcessedSeq): Summary = { + preppedFeature match { + case Left(v) => Summary(v.size, v.size) + case Right(v) => monoid.sum(v.map(d => Summary(d, d))) + } + } +} diff --git a/core/src/main/scala/com/salesforce/op/filters/package.scala b/core/src/main/scala/com/salesforce/op/filters/package.scala new file mode 100644 index 0000000000..7e5971bb29 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/filters/package.scala @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op + +// scalastyle:off ensure.single.space.after.token +package object filters { + private[filters] type FeatureKey = (String, Option[String]) + private[filters] type ProcessedSeq = Either[Seq[String], Seq[Double]] +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifier.scala new file mode 100644 index 0000000000..e4da69bd8a --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifier.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, OpDecisionTreeClassifierParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark decision tree classifier [[org.apache.spark.ml.classification.DecisionTreeClassifier]] + * @param uid stage uid + */ +class OpDecisionTreeClassifier(uid: String = UID[OpDecisionTreeClassifier]) + extends OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]( + predictor = new DecisionTreeClassifier(), + uid = uid + ) with OpDecisionTreeClassifierParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. 
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + +} + + +/** + * Class that takes in a spark DecisionTreeClassificationModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpDecisionTreeClassificationModel +( + sparkModel: DecisionTreeClassificationModel, + uid: String = UID[OpDecisionTreeClassificationModel], + operationName: String = classOf[DecisionTreeClassifier].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[DecisionTreeClassificationModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifier.scala new file mode 100644 index 0000000000..e8f299010c --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifier.scala @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
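A minimal usage sketch for the new OpDecisionTreeClassifier (label: FeatureLike[RealNN] and features: FeatureLike[OPVector] are hypothetical handles; the response comes first, as enforced by CheckIsResponseValues):

  val dt = new OpDecisionTreeClassifier()
    .setInput(label, features)
    .setMaxDepth(10)
    .setMaxBins(64)
    .setImpurity("gini")
  val prediction = dt.getOutput()  // FeatureLike[Prediction]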
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier, OpGBTClassifierParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark GBT classifier [[org.apache.spark.ml.classification.GBTClassifier]] + * @param uid stage uid + */ +class OpGBTClassifier(uid: String = UID[OpGBTClassifier]) + extends OpPredictorWrapper[GBTClassifier, GBTClassificationModel]( + predictor = new GBTClassifier(), + uid = uid + ) with OpGBTClassifierParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. + * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** + * The impurity setting is ignored for GBT models. + * Individual trees are built using impurity "Variance." 
+ * + * @group setParam + */ + override def setImpurity(value: String): this.type = { + logWarning("GBTClassifier.setImpurity should NOT be used") + this + } + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from GBTParams: + + /** @group setParam */ + override def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** @group setParam */ + override def setStepSize(value: Double): this.type = set(stepSize, value) + + // Parameters from GBTClassifierParams: + + /** @group setParam */ + def setLossType(value: String): this.type = set(lossType, value) +} + + + +/** + * Class that takes in a spark GBTClassificationModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpGBTClassificationModel +( + sparkModel: GBTClassificationModel, + uid: String = UID[OpGBTClassificationModel], + operationName: String = classOf[GBTClassifier].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[GBTClassificationModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala new file mode 100644 index 0000000000..0fe69ef3e8 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLinearSVC.scala @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
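A usage sketch for the OpGBTClassifier above (hypothetical label/features handles):

  val gbt = new OpGBTClassifier()
    .setInput(label, features)
    .setMaxIter(50)
    .setStepSize(0.1)
    .setLossType("logistic")  // the only loss type GBTClassifier supports
  // Note that setImpurity is deliberately a no-op: GBT trees always use variance impurity.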
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{LinearSVC, LinearSVCModel, OpLinearSVCParams} +import org.apache.spark.ml.linalg.Vector + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark Linear SVC [[org.apache.spark.ml.classification.LinearSVC]] + * @param uid stage uid + */ +class OpLinearSVC(uid: String = UID[OpLinearSVC]) + extends OpPredictorWrapper[LinearSVC, LinearSVCModel]( + predictor = new LinearSVC(), + uid = uid + ) with OpLinearSVCParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** + * Set the regularization parameter. + * Default is 0.0. + * + * @group setParam + */ + def setRegParam(value: Double): this.type = set(regParam, value) + setDefault(regParam -> 0.0) + + /** + * Set the maximum number of iterations. + * Default is 100. + * + * @group setParam + */ + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 100) + + /** + * Whether to fit an intercept term. + * Default is true. + * + * @group setParam + */ + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + setDefault(fitIntercept -> true) + + /** + * Set the convergence tolerance of iterations. + * Smaller values will lead to higher accuracy at the cost of more iterations. + * Default is 1E-6. + * + * @group setParam + */ + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) + + /** + * Whether to standardize the training features before fitting the model. + * Default is true. + * + * @group setParam + */ + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization -> true) + + /** + * Set the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. + * Default is not set, so all instances have weight one. + * + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) + + /** + * Set threshold in binary classification. + * + * @group setParam + */ + def setThreshold(value: Double): this.type = set(threshold, value) + setDefault(threshold -> 0.0) + + /** + * Suggested depth for treeAggregate (greater than or equal to 2). + * If the dimensions of features or the number of partitions are large, + * this param could be adjusted to a larger size. + * Default is 2. 
+ * + * @group expertSetParam + */ + def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) + setDefault(aggregationDepth -> 2) +} + + +/** + * Class that takes in a spark LinearSVCModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpLinearSVCModel +( + sparkModel: LinearSVCModel, + uid: String = UID[OpLinearSVCModel], + operationName: String = classOf[LinearSVC].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictorWrapperModel[LinearSVCModel](uid = uid, operationName = operationName, sparkModel = sparkModel) { + + @transient private lazy val predictRaw = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient private lazy val predict = reflectMethod(getSparkMlStage().get, "predict") + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { + val raw = predictRaw.apply(features.value).asInstanceOf[Vector] + val pred = predict.apply(features.value).asInstanceOf[Double] + + Prediction(rawPrediction = raw, prediction = pred) + } +} diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala index 61766580f1..1b37316735 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegression.scala @@ -32,60 +32,52 @@ package com.salesforce.op.stages.impl.classification import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.OpProbabilisticClassifierWrapper -import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, OpLogisticRegressionParams} +import org.apache.spark.ml.linalg.{Matrix, Vector} + +import scala.reflect.runtime.universe.TypeTag /** - * Wrapper around spark ml logistic regression for use with OP pipelines + * Wrapper around spark ml logistic regression [[org.apache.spark.ml.classification.LogisticRegression]] */ class OpLogisticRegression(uid: String = UID[OpLogisticRegression]) - extends OpProbabilisticClassifierWrapper[LogisticRegression, LogisticRegressionModel]( - new LogisticRegression(), + extends OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]( + predictor = new LogisticRegression(), uid = uid - ) { + ) with OpLogisticRegressionParams { override protected def onSetInput(): Unit = { super.onSetInput() CheckIsResponseValues(in1, in2) } - /** - * Set thresholds in multiclass (or binary) classification to adjust the probability of - * predicting each class. Array must have length equal to the number of classes, with values >= 0. 
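A usage sketch for the OpLinearSVC wrapper above (hypothetical label/features handles). The same set(param, value)/setDefault style replaces the old getSparkStage mutation pattern throughout these rewritten wrappers:

  val svc = new OpLinearSVC()
    .setInput(label, features)
    .setRegParam(0.1)
    .setMaxIter(200)
  // The resulting Prediction carries rawPrediction and prediction only; LinearSVC yields
  // no class probabilities, hence OpPredictorWrapperModel rather than a probabilistic model.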
- * The class with largest value p/t is predicted, where p is the original probability of that
- * class and t is the class' threshold.
- *
- * @group setParam
- */
- def setThresholds(value: Array[Double]): this.type = {
- getSparkStage.setThresholds(value)
- this
- }
-
/**
* Set the regularization parameter.
* Default is 0.0.
*
* @group setParam
*/
- def setRegParam(value: Double): this.type = {
- getSparkStage.setRegParam(value)
- this
- }
+ def setRegParam(value: Double): this.type = set(regParam, value)
+ setDefault(regParam -> 0.0)
/**
* Set the ElasticNet mixing parameter.
- * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
- * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
+ * For alpha = 0, the penalty is an L2 penalty.
+ * For alpha = 1, it is an L1 penalty.
+ * For alpha in (0,1), the penalty is a combination of L1 and L2.
* Default is 0.0 which is an L2 penalty.
*
+ * Note: Fitting under bound constrained optimization only supports L2 regularization,
+ * so an exception is thrown if this param is set to a non-zero value.
+ *
* @group setParam
*/
- def setElasticNetParam(value: Double): this.type = {
- getSparkStage.setElasticNetParam(value)
- this
- }
+ def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
+ setDefault(elasticNetParam -> 0.0)
/**
* Set the maximum number of iterations.
@@ -93,22 +85,18 @@ class OpLogisticRegression(uid: String = UID[OpLogisticRegression])
*
* @group setParam
*/
- def setMaxIter(value: Int): this.type = {
- getSparkStage.setMaxIter(value)
- this
- }
+ def setMaxIter(value: Int): this.type = set(maxIter, value)
+ setDefault(maxIter -> 100)
/**
* Set the convergence tolerance of iterations.
- * Smaller value will lead to higher accuracy with the cost of more iterations.
+ * Smaller value will lead to higher accuracy at the cost of more iterations.
* Default is 1E-6.
*
* @group setParam
*/
- def setTol(value: Double): this.type = {
- getSparkStage.setTol(value)
- this
- }
+ def setTol(value: Double): this.type = set(tol, value)
+ setDefault(tol -> 1E-6)
/**
* Whether to fit an intercept term.
@@ -116,10 +104,17 @@ class OpLogisticRegression(uid: String = UID[OpLogisticRegression])
*
* @group setParam
*/
- def setFitIntercept(value: Boolean): this.type = {
- getSparkStage.setFitIntercept(value)
- this
- }
+ def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
+ setDefault(fitIntercept -> true)
+
+ /**
+ * Sets the value of param [[family]].
+ * Default is "auto".
+ *
+ * @group setParam
+ */
+ def setFamily(value: String): this.type = set(family, value)
+ setDefault(family -> "auto")
/**
* Whether to standardize the training features before fitting the model.
@@ -131,21 +126,89 @@ class OpLogisticRegression(uid: String = UID[OpLogisticRegression])
*
* @group setParam
*/
- def setStandardization(value: Boolean): this.type = {
- getSparkStage.setStandardization(value)
- this
- }
+ def setStandardization(value: Boolean): this.type = set(standardization, value)
+ setDefault(standardization -> true)
+
+ override def setThreshold(value: Double): this.type = super.setThreshold(value)
+
/**
- * Whether to over-/under-sample training instances according to the given weights in weightCol.
- * If not set or empty String, all instances are treated equally (weight 1.0).
+ * Sets the value of param [[weightCol]].
+ * If this is not set or empty, we treat all instance weights as 1.0.
* Default is not set, so all instances have weight one.
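With the rewritten setters, OpLogisticRegression gets the same fluent configuration style as the other new wrappers; a sketch (hypothetical label/features handles):

  val lr = new OpLogisticRegression()
    .setInput(label, features)
    .setRegParam(0.01)
    .setElasticNetParam(0.5)  // blend of L1 and L2
    .setMaxIter(200)
    .setFamily("binomial")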
* * @group setParam */ - def setWeightCol(value: String): this.type = { - getSparkStage.setWeightCol(value) - this - } + def setWeightCol(value: String): this.type = set(weightCol, value) + + override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) + + /** + * Suggested depth for treeAggregate (greater than or equal to 2). + * If the dimensions of features or the number of partitions are large, + * this param could be adjusted to a larger size. + * Default is 2. + * + * @group expertSetParam + */ + def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) + setDefault(aggregationDepth -> 2) + /** + * Set the lower bounds on coefficients if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setLowerBoundsOnCoefficients(value: Matrix): this.type = set(lowerBoundsOnCoefficients, value) + + /** + * Set the upper bounds on coefficients if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setUpperBoundsOnCoefficients(value: Matrix): this.type = set(upperBoundsOnCoefficients, value) + + /** + * Set the lower bounds on intercepts if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setLowerBoundsOnIntercepts(value: Vector): this.type = set(lowerBoundsOnIntercepts, value) + + /** + * Set the upper bounds on intercepts if fitting under bound constrained optimization. + * + * @group expertSetParam + */ + def setUpperBoundsOnIntercepts(value: Vector): this.type = set(upperBoundsOnIntercepts, value) + +} + + +/** + * Class that takes in a spark LogisticRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpLogisticRegressionModel +( + sparkModel: LogisticRegressionModel, + operationName: String = classOf[LogisticRegression].getSimpleName, + uid: String = UID[OpLogisticRegressionModel] +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[LogisticRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") } + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala new file mode 100644 index 0000000000..1de8560b3b --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifier.scala @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.classification
+
+import com.salesforce.op.UID
+import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
+import com.salesforce.op.stages.impl.CheckIsResponseValues
+import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
+import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
+import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier, OpMultilayerPerceptronClassifierParams}
+import org.apache.spark.ml.linalg.Vector
+
+import scala.reflect.runtime.universe.TypeTag
+
+/**
+ * Wrapper for spark MultilayerPerceptronClassifier
+ * [[org.apache.spark.ml.classification.MultilayerPerceptronClassifier]]
+ * @param uid stage uid
+ */
+class OpMultilayerPerceptronClassifier(uid: String = UID[OpMultilayerPerceptronClassifier])
+ extends OpPredictorWrapper[MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel](
+ predictor = new MultilayerPerceptronClassifier(),
+ uid = uid
+ ) with OpMultilayerPerceptronClassifierParams {
+
+ override protected def onSetInput(): Unit = {
+ super.onSetInput()
+ CheckIsResponseValues(in1, in2)
+ }
+
+ /**
+ * Sets the value of param [[layers]].
+ *
+ * @group setParam
+ */
+ def setLayers(value: Array[Int]): this.type = set(layers, value)
+
+ /**
+ * Sets the value of param [[blockSize]].
+ * Default is 128.
+ *
+ * @group expertSetParam
+ */
+ def setBlockSize(value: Int): this.type = set(blockSize, value)
+
+ /**
+ * Sets the value of param [[solver]].
+ * Default is "l-bfgs".
+ *
+ * @group expertSetParam
+ */
+ def setSolver(value: String): this.type = set(solver, value)
+
+ /**
+ * Set the maximum number of iterations.
+ * Default is 100.
+ *
+ * @group setParam
+ */
+ def setMaxIter(value: Int): this.type = set(maxIter, value)
+
+ /**
+ * Set the convergence tolerance of iterations.
+ * Smaller value will lead to higher accuracy at the cost of more iterations.
+ * Default is 1E-6.
+ *
+ * @group setParam
+ */
+ def setTol(value: Double): this.type = set(tol, value)
+
+ /**
+ * Set the seed for weights initialization if weights are not set
+ *
+ * @group setParam
+ */
+ def setSeed(value: Long): this.type = set(seed, value)
+
+ /**
+ * Sets the value of param [[initialWeights]].
+ *
+ * @group expertSetParam
+ */
+ def setInitialWeights(value: Vector): this.type = set(initialWeights, value)
+
+ /**
+ * Sets the value of param [[stepSize]] (applicable only for solver "gd").
+ * Default is 0.03.
+ *
+ * @group setParam
+ */
+ def setStepSize(value: Double): this.type = set(stepSize, value)
+}
+
+
+/**
+ * Class that takes in a spark MultilayerPerceptronClassificationModel and wraps it into an OP model which returns a
+ * Prediction feature
+ *
+ * @param sparkModel model to wrap
+ * @param uid uid to give stage
+ * @param operationName unique name of the operation this stage performs
+ */
+// TODO in next release of spark this will be a probabilistic classifier
+class OpMultilayerPerceptronClassificationModel
+(
+ sparkModel: MultilayerPerceptronClassificationModel,
+ uid: String = UID[OpMultilayerPerceptronClassificationModel],
+ operationName: String = classOf[MultilayerPerceptronClassifier].getSimpleName
+)(
+ implicit tti1: TypeTag[RealNN],
+ tti2: TypeTag[OPVector],
+ tto: TypeTag[Prediction],
+ ttov: TypeTag[Prediction#Value]
+) extends OpPredictionModel[MultilayerPerceptronClassificationModel](
+ sparkModel = sparkModel, uid = uid, operationName = operationName
+) {
+ @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict")
+}
+
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayes.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayes.scala new file mode 100644 index 0000000000..c935c3b6b4 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayes.scala @@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
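A usage sketch for OpMultilayerPerceptronClassifier above (sizes illustrative: the first element of layers must equal the feature vector dimension and the last the number of classes):

  val mlp = new OpMultilayerPerceptronClassifier()
    .setInput(label, features)        // hypothetical response/predictor features
    .setLayers(Array(100, 10, 5, 3))  // 100 inputs, two hidden layers, 3 classes
    .setMaxIter(200)
    .setSeed(42L)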
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel, OpNaiveBayesParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark Naive Bayes [[org.apache.spark.ml.classification.NaiveBayesModel]] + * @param uid stage uid + */ +class OpNaiveBayes(uid: String = UID[OpNaiveBayes]) + extends OpPredictorWrapper[NaiveBayes, NaiveBayesModel]( + predictor = new NaiveBayes(), + uid = uid + ) with OpNaiveBayesParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** + * Set the smoothing parameter. + * Default is 1.0. + * @group setParam + */ + def setSmoothing(value: Double): this.type = set(smoothing, value) + setDefault(smoothing -> 1.0) + + /** + * Set the model type using a string (case-sensitive). + * Supported options: "multinomial" and "bernoulli". + * Default is "multinomial" + * @group setParam + */ + def setModelType(value: String): this.type = set(modelType, value) + setDefault(modelType -> "multinomial") + + /** + * Sets the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. + * Default is not set, so all instances have weight one. + * + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) +} + + +/** + * Class that takes in a spark NaiveBayesModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpNaiveBayesModel +( + sparkModel: NaiveBayesModel, + uid: String = UID[OpNaiveBayesModel], + operationName: String = classOf[NaiveBayes].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[NaiveBayesModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} + + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForest.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForest.scala deleted file mode 100644 index ec9c61e525..0000000000 --- a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForest.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.impl.classification - -import com.salesforce.op.UID -import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.OpProbabilisticClassifierWrapper -import enumeratum.{Enum, EnumEntry} -import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} - -sealed abstract class Impurity(val sparkName: String) extends EnumEntry with Serializable - -object Impurity extends Enum[Impurity] { - val values: Seq[Impurity] = findValues - - case object Entropy extends Impurity("entropy") - case object Gini extends Impurity("gini") - case object Variance extends Impurity("variance") -} - - -class OpRandomForest(uid: String = UID[OpRandomForest]) - extends OpProbabilisticClassifierWrapper[RandomForestClassifier, RandomForestClassificationModel]( - probClassifier = new RandomForestClassifier, - uid = uid - ) -{ - - override protected def onSetInput(): Unit = { - super.onSetInput() - CheckIsResponseValues(in1, in2) - } - - /** - * Set maximum depth of the tree (>= 0). - * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. - * (default = 5) - * - * @group setParam - */ - def setMaxDepth(value: Int): this.type = { - getSparkStage.setMaxDepth(value) - this - } - - /** - * Set maximum number of bins used for discretizing continuous features and for choosing how to split - * on features at each node. More bins give higher granularity. - * Must be >= 2 and >= number of categories in any categorical feature. - * (default = 32) - * - * @group setParam - */ - def setMaxBins(value: Int): this.type = { - getSparkStage.setMaxBins(value) - this - } - - /** - * Set minimum number of instances each child must have after split. - * If a split causes the left or right child to have fewer than minInstancesPerNode, - * the split will be discarded as invalid. - * Should be >= 1. - * (default = 1) - * - * @group setParam - */ - def setMinInstancesPerNode(value: Int): this.type = { - getSparkStage.setMinInstancesPerNode(value) - this - } - - /** - * Set minimum information gain for a split to be considered at a tree node. 
- * (default = 0.0) - * - * @group setParam - */ - def setMinInfoGain(value: Double): this.type = { - getSparkStage.setMinInfoGain(value) - this - } - - /** - * Set fraction of the training data used for learning each decision tree, in range (0, 1]. - * (default = 1.0) - * - * @group setParam - */ - def setSubsamplingRate(value: Double): this.type = { - getSparkStage.setSubsamplingRate(value) - this - } - - /** - * Set number of trees to train (>= 1). - * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done. - * (default = 20) - * - * @group setParam - */ - def setNumTrees(value: Int): this.type = { - getSparkStage.setNumTrees(value) - this - } - - /** - * Set criterion used for information gain calculation (case-insensitive). - * Supported: "entropy" and "gini". - * (default = gini) - * - * @group setParam - */ - def setImpurity(value: Impurity): this.type = { - getSparkStage.setImpurity(value.sparkName) - this - } - - /** - * Set param for random seed. - * - * @group setParam - */ - def setSeed(value: Long): this.type = { - getSparkStage.setSeed(value) - this - } - - /** - * Set thresholds in multiclass (or binary) classification to adjust the probability of - * predicting each class. Array must have length equal to the number of classes, with values >= 0. - * The class with largest value p/t is predicted, where p is the original probability of that - * class and t is the class' threshold. - * - * @group setParam - */ - def setThresholds(value: Array[Double]): this.type = { - getSparkStage.setThresholds(value) - this - } -} - diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifier.scala new file mode 100644 index 0000000000..74215dba51 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifier.scala @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import enumeratum.{Enum, EnumEntry} +import org.apache.spark.ml.classification.{OpRandomForestClassifierParams, RandomForestClassificationModel, RandomForestClassifier} + +import scala.reflect.runtime.universe.TypeTag + +sealed abstract class Impurity(val sparkName: String) extends EnumEntry with Serializable + +object Impurity extends Enum[Impurity] { + val values: Seq[Impurity] = findValues + + case object Entropy extends Impurity("entropy") + case object Gini extends Impurity("gini") + case object Variance extends Impurity("variance") +} + + +/** + * Wrapper for spark Random Forest Classifier [[org.apache.spark.ml.classification.RandomForestClassifier]] + * @param uid stage uid + */ +class OpRandomForestClassifier(uid: String = UID[OpRandomForestClassifier]) + extends OpPredictorWrapper[RandomForestClassifier, RandomForestClassificationModel]( + predictor = new RandomForestClassifier(), + uid = uid + ) with OpRandomForestClassifierParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + // Parameters from TreeClassifierParams: + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. 
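A configuration sketch for the new OpRandomForestClassifier (hypothetical label/features handles; the ensemble-level setters such as numTrees follow just below):

  val rf = new OpRandomForestClassifier()
    .setInput(label, features)
    .setMaxDepth(12)
    .setMinInfoGain(0.01)
    .setCacheNodeIds(true)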
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from RandomForestParams: + + /** @group setParam */ + override def setNumTrees(value: Int): this.type = set(numTrees, value) + + /** @group setParam */ + override def setFeatureSubsetStrategy(value: String): this.type = + set(featureSubsetStrategy, value) + + /** + * Param for Thresholds in multi-class classification to adjust the probability of predicting each class. + * Array must have length equal to the number of classes, with values > 0 excepting that at most one value + * may be 0. The class with largest value p/t is predicted, where p is the original probability of that class + * and t is the class's threshold. + * @group param + */ + def setThresholds(value: Array[Double]): this.type = set(thresholds, value) + +} + + +/** + * Class that takes in a spark RandomForestClassificationModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpRandomForestClassificationModel +( + sparkModel: RandomForestClassificationModel, + uid: String = UID[OpRandomForestClassificationModel], + operationName: String = classOf[RandomForestClassifier].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpProbabilisticClassifierModel[RandomForestClassificationModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") + @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") + @transient lazy val probability2predictionMirror = + reflectMethod(getSparkMlStage().get, "probability2prediction") +} + + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala index 995b481c2b..168e2bf6a6 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizer.scala @@ -78,23 +78,24 @@ class DecisionTreeNumericMapBucketizer[N, I2 <: OPMap[N]] label -> filterKeys[N](map, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues) }.persist() - require(!ds.isEmpty, "Dataset is empty, buckets cannot be computed.") - - // Collect all unique map keys and sort them - val uniqueKeys: Seq[String] = - ds.map { case (_, map) => map.keys.toSeq } - .reduce((l, r) => (l ++ r).distinct) - .distinct.sorted - - // Compute splits for each collected key in parallel - val computedSplits: Array[(String, Splits)] = + val computedSplits: Array[(String, Splits)] = if (ds.isEmpty) { + log.info("Skip bucketizing empty numeric map '{}' feature", in2.name) + Array.empty[(String, Splits)] + } else { + // Collect all 
unique map keys and sort them + val uniqueKeys: Seq[String] = + ds.map { case (_, map) => map.keys.toSeq } + .reduce((l, r) => (l ++ r).distinct) + .distinct.sorted + + // Compute splits for each collected key in parallel uniqueKeys.par.map { k => val data: Dataset[(Double, Double)] = ds.filter(_._2.contains(k)) .map { case (label, map) => label.get -> nev.toDouble(map(k)) } k -> computeSplits(data, featureName = s"${in2.name}[$k]") }.toArray - + } ds.unpersist() val meta = makeMetadata(computedSplits) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala index d0979ecfd0..67f87c9109 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetector.scala @@ -38,7 +38,7 @@ import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer import org.apache.commons.io.input.BoundedInputStream -import org.apache.spark.ml.param.{Param, Params} +import org.apache.spark.ml.param.{LongParam, Param, Params} import org.apache.tika.detect.{DefaultDetector, Detector} import org.apache.tika.metadata.{HttpHeaders, Metadata} import org.apache.tika.mime.MediaType @@ -84,7 +84,7 @@ private[op] trait MimeTypeDetectorParams extends Params { ) def setTypeHint(value: String): this.type = set(typeHint, value) - final val maxBytesToParse = new Param[Long]( + final val maxBytesToParse = new LongParam( parent = this, name = "maxBytesToParse", doc = "maximum number of bytes to parse during detection", isValid = (v: Long) => v >= 0L ) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizer.scala new file mode 100644 index 0000000000..b3e5fd927c --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizer.scala @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
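The MimeTypeDetector change to maxBytesToParse above deserves a note: Spark's generic Param[T].jsonEncode is only implemented for a handful of types, so a Param[Long] cannot round-trip through ML persistence, whereas LongParam supplies its own jsonEncode/jsonDecode. An illustrative contrast (not part of the patch):

  // new Param[Long](this, "n", "doc")  -- jsonEncode throws NotImplementedError on save
  // new LongParam(this, "n", "doc")    -- encodes the Long explicitly, so save/load works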
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.feature
+
+import com.salesforce.op.UID
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.base.unary.UnaryTransformer
+import com.salesforce.op.utils.text._
+import com.twitter.algebird.Operators._
+
+import scala.reflect.runtime.universe.TypeTag
+
+/**
+ * Name Entity [[NameEntityType]] text recognizer.
+ *
+ * Note: when providing your own analyzer/splitter/tagger, make sure they can work together;
+ * for instance, OpenNLP models require their own analyzers to be provided when tokenizing.
+ * The returned feature type is a [[MultiPickListMap]], which contains sets of entities for all the tokens.
+ *
+ * @param languageDetector a language detector instance (defaults to [[OptimaizeLanguageDetector]])
+ * @param analyzer a text analyzer instance (defaults to an [[OpenNLPAnalyzer]])
+ * @param sentenceSplitter a sentence splitter instance (defaults to an [[OpenNLPSentenceSplitter]])
+ * @param tagger name entity tagger (defaults to [[OpenNLPNameEntityTagger]])
+ * @param uid uid for instance
+ * @param tti type tag for input feature type
+ * @tparam T text feature type
+ */
+class NameEntityRecognizer[T <: Text]
+(
+ val languageDetector: LanguageDetector = NameEntityRecognizer.LanguageDetector,
+ val analyzer: TextAnalyzer = NameEntityRecognizer.Analyzer,
+ val sentenceSplitter: SentenceSplitter = NameEntityRecognizer.Splitter,
+ val tagger: NameEntityTagger[_ <: TaggerResult] = NameEntityRecognizer.Tagger,
+ uid: String = UID[NameEntityRecognizer[_]]
+)(implicit tti: TypeTag[T])
+ extends UnaryTransformer[T, MultiPickListMap](uid = uid, operationName = "nameEntityRec")
+ with LanguageDetectionParams {
+
+ setDefault(
+ autoDetectLanguage -> NameEntityRecognizer.AutoDetectLanguage,
+ autoDetectThreshold -> NameEntityRecognizer.AutoDetectThreshold,
+ defaultLanguage -> NameEntityRecognizer.DefaultLanguage.entryName
+ )
+
+ def transformFn: T => MultiPickListMap = text => {
+ val res = TextTokenizer.tokenize(
+ text = text,
+ languageDetector = languageDetector,
+ analyzer = analyzer,
+ sentenceSplitter = Option(sentenceSplitter),
+ autoDetectLanguage = getAutoDetectLanguage,
+ autoDetectThreshold = getAutoDetectThreshold,
+ defaultLanguage = getDefaultLanguage,
+ toLowercase = false
+ )
+ val sentenceTags = res.sentences.view.map { sentence =>
+ val tags = tagger.tag(sentence.value, res.language, NameEntityType.values)
+ tags.tokenTags.mapValues(_.map(_.toString))
+ }
+ sentenceTags.foldLeft(Map.empty[String, Set[String]])(_ + _).toMultiPickListMap
+ }
+
+}
+
+object NameEntityRecognizer {
+ val Analyzer: TextAnalyzer = new OpenNLPAnalyzer()
+ val LanguageDetector: LanguageDetector = new OptimaizeLanguageDetector()
+ val Tagger: NameEntityTagger[_ <: TaggerResult] = new OpenNLPNameEntityTagger()
+ val Splitter: SentenceSplitter = new OpenNLPSentenceSplitter()
+ val AutoDetectLanguage = false
+ val AutoDetectThreshold = 0.99
+ val DefaultLanguage: Language = Language.English
+}
diff --git 
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
index 172857267d..28310f5c3a 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
@@ -36,7 +36,9 @@ import com.salesforce.op.features.TransientFeature
 import com.salesforce.op.features.types._
 import com.salesforce.op.stages.OpPipelineStageBase
 import com.salesforce.op.stages.base.sequence.SequenceTransformer
+import com.salesforce.op.stages.impl.feature.HashSpaceStrategy.findValues
 import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
+import enumeratum.{Enum, EnumEntry}
 import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
 import org.apache.spark.ml.param._
 import org.apache.spark.mllib.feature.HashingTF
@@ -97,6 +99,16 @@ class OPCollectionHashingVectorizer[T <: OPCollection](uid: String = UID[OPColle
   }
 }

+sealed trait HashSpaceStrategy extends EnumEntry with Serializable
+
+object HashSpaceStrategy extends Enum[HashSpaceStrategy] {
+  val values: Seq[HashSpaceStrategy] = findValues
+
+  case object Shared extends HashSpaceStrategy
+  case object Separate extends HashSpaceStrategy
+  case object Auto extends HashSpaceStrategy
+}
+
 private[op] trait HashingVectorizerParams extends Params {
   final val numFeatures = new IntParam(
     parent = this, name = "numFeatures",
@@ -115,11 +127,22 @@ private[op] trait HashingVectorizerParams extends Params {
   )
   def setHashWithIndex(v: Boolean): this.type = set(hashWithIndex, v)

+  @deprecated("Functionality replaced by hashSpaceStrategy", "3.3.0")
   final val forceSharedHashSpace = new BooleanParam(
     parent = this, name = "forceSharedHashSpace",
     doc = s"if true, then force the hash space to be shared among all included features"
   )
+  @deprecated("Functionality replaced by hashSpaceStrategy", "3.3.0")
   def setForceSharedHashSpace(v: Boolean): this.type = set(forceSharedHashSpace, v)
+  @deprecated("Functionality replaced by hashSpaceStrategy", "3.3.0")
+  def getForceSharedHashSpace: Boolean = $(forceSharedHashSpace)
+
+  final val hashSpaceStrategy: Param[String] = new Param[String](this, "hashSpaceStrategy",
+    "Strategy to determine whether to use shared or separate hash space for input text features",
+    (value: String) => HashSpaceStrategy.withNameInsensitiveOption(value).isDefined
+  )
+  def setHashSpaceStrategy(v: HashSpaceStrategy): this.type = set(hashSpaceStrategy, v.entryName)
+  def getHashSpaceStrategy: HashSpaceStrategy = HashSpaceStrategy.withNameInsensitive($(hashSpaceStrategy))

   final val prependFeatureName = new BooleanParam(
     parent = this, name = "prependFeatureName",
@@ -132,6 +155,7 @@ private[op] trait HashingVectorizerParams extends Params {
     isValid = (s: String) => HashAlgorithm.withNameInsensitiveOption(s).isDefined
   )
   def setHashAlgorithm(h: HashAlgorithm): this.type = set(hashAlgorithm, h.toString.toLowerCase)
+  def getHashAlgorithm: HashAlgorithm = HashAlgorithm.withNameInsensitive($(hashAlgorithm))

   final val binaryFreq = new BooleanParam(
     parent = this, name = "binaryFreq",
@@ -145,7 +169,8 @@ private[op] trait HashingVectorizerParams extends Params {
     forceSharedHashSpace -> false,
     prependFeatureName -> TransmogrifierDefaults.PrependFeatureName,
     hashAlgorithm -> TransmogrifierDefaults.HashAlgorithm.toString.toLowerCase,
-    binaryFreq -> TransmogrifierDefaults.BinaryFreq
+    binaryFreq -> TransmogrifierDefaults.BinaryFreq,
+    hashSpaceStrategy -> HashSpaceStrategy.Auto.toString
   )
 }
@@ -162,6 +187,7 @@ private[op] trait HashingVectorizerParams extends Params {
  * @param binaryFreq if true, term frequency vector will be binary such that non-zero term counts
  *                   will be set to 1.0
  * @param hashAlgorithm hash algorithm to use
+ * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features
  */
 case class HashingFunctionParams
 (
@@ -172,7 +198,8 @@ case class HashingFunctionParams
   maxNumOfFeatures: Int,
   forceSharedHashSpace: Boolean,
   binaryFreq: Boolean,
-  hashAlgorithm: HashAlgorithm
+  hashAlgorithm: HashAlgorithm,
+  hashSpaceStrategy: HashSpaceStrategy = HashSpaceStrategy.Auto
 )

 /**
@@ -189,7 +216,12 @@ private[op] trait HashingFun {
   protected def isSharedHashSpace(p: HashingFunctionParams, numFeatures: Option[Int] = None): Boolean = {
     val numHashes = p.numFeatures
     val numOfFeatures = numFeatures.getOrElse(p.numInputs)
-    (numHashes * numOfFeatures) > p.maxNumOfFeatures || p.forceSharedHashSpace
+    import HashSpaceStrategy._
+    p.hashSpaceStrategy match {
+      case s if p.forceSharedHashSpace || s.equals(Shared) => true
+      case Separate => false
+      case Auto => (numHashes * numOfFeatures) > p.maxNumOfFeatures
+    }
   }

   /**
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala
index 8c3cce7aed..d83ea2dddc 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala
@@ -451,7 +451,7 @@ final class TextMapHashingVectorizerModel[T <: OPMap[String]] private[op]
       val keys = args.allKeys(i)
       val cleaned = cleanMap(map.v, shouldCleanKey = args.shouldCleanKeys, shouldCleanValue = args.shouldCleanValues)
       val mapValues = cleaned.map { case (k, v) => v.toText }
-      mapValues.map(tokenize(_)._2).toSeq
+      mapValues.map(tokenize(_).tokens).toSeq
     }
     val allTokens = tokenSeq.flatMap(_.value).toTextList
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala
index f8a4492c05..11f16ae80b 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/RealNNVectorizer.scala
@@ -50,7 +50,7 @@ class RealNNVectorizer
   /**
    * Function used to convert input to output
    */
-  override def transformFn: (Seq[RealNN]) => OPVector = in => {
+  override def transformFn: Seq[RealNN] => OPVector = in => {
     val ins = in.map(_.value.get) // assumes a non nullable real (RealNN)
     Vectors.dense(ins.toArray).toOPVector
   }
 }
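A sketch of how the new hash-space strategy resolves, mirroring isSharedHashSpace above: Shared and Separate are explicit choices, Auto keeps the old size-based behavior, and the deprecated forceSharedHashSpace flag still forces sharing for backward compatibility. Parsing is case-insensitive, courtesy of enumeratum:

import com.salesforce.op.stages.impl.feature.HashSpaceStrategy

val strategy = HashSpaceStrategy.withNameInsensitive("shared") // HashSpaceStrategy.Shared

// on a vectorizer, e.g.:
// new OPCollectionHashingVectorizer[TextList]().setHashSpaceStrategy(HashSpaceStrategy.Separate)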
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
index 58600cec34..cada837d5d 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
@@ -40,8 +40,8 @@ import com.salesforce.op.utils.spark.RichDataset._
 import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
 import com.twitter.algebird.Monoid._
 import com.twitter.algebird.Operators._
-import com.twitter.algebird.macros.caseclass
 import com.twitter.algebird.Semigroup
+import com.twitter.algebird.macros.caseclass
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.{Dataset, Encoder}
@@ -84,9 +84,10 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
     numFeatures = $(numFeatures),
     numInputs = inN.length,
     maxNumOfFeatures = TransmogrifierDefaults.MaxNumOfFeatures,
-    forceSharedHashSpace = $(forceSharedHashSpace),
+    forceSharedHashSpace = getForceSharedHashSpace,
     binaryFreq = $(binaryFreq),
-    hashAlgorithm = HashAlgorithm.withNameInsensitive($(hashAlgorithm))
+    hashAlgorithm = getHashAlgorithm,
+    hashSpaceStrategy = getHashSpaceStrategy
   )

   private def makeVectorMetadata(args: SmartTextMapVectorizerModelArgs): OpVectorMetadata = {
@@ -244,14 +245,14 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
   private def partitionRow(row: Seq[OPMap[String]]):
   (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = {
     val (rowCategorical, keysCategorical) =
-      row.view.zip(args.categoricalKeys).collect{ case (elements, keys) if keys.nonEmpty =>
-        val filtered = elements.value.filter{ case (k, v) => keys.contains(k) }
+      row.view.zip(args.categoricalKeys).collect { case (elements, keys) if keys.nonEmpty =>
+        val filtered = elements.value.filter { case (k, v) => keys.contains(k) }
         (TextMap(filtered), keys)
       }.unzip

     val (rowText, keysText) =
-      row.view.zip(args.textKeys).collect{ case (elements, keys) if keys.nonEmpty =>
-        val filtered = elements.value.filter{ case (k, v) => keys.contains(k) }
+      row.view.zip(args.textKeys).collect { case (elements, keys) if keys.nonEmpty =>
+        val filtered = elements.value.filter { case (k, v) => keys.contains(k) }
         (TextMap(filtered), keys)
       }.unzip

@@ -261,17 +262,17 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
   def transformFn: Seq[T] => OPVector = row => {
     val (rowCategorical, keysCategorical, rowText, keysText) = partitionRow(row)
     val categoricalVector = categoricalPivotFn(rowCategorical)
-    val rowTextTokenized = rowText.map( m => m.value.map{ case (k, v) => k -> tokenize(v.toText)._2 } )
+    val rowTextTokenized = rowText.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens })
     val textVector = hash(rowTextTokenized, keysText, args.hashingParams)
     val textNullIndicatorsVector =
-      if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(keysText, rowText)) else Nil
+      if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(keysText, rowTextTokenized)) else Nil

     VectorsCombiner.combineOP(Seq(categoricalVector, textVector) ++ textNullIndicatorsVector)
   }

-  private def getNullIndicatorsVector(keysSeq: Seq[Seq[String]], inputs: Seq[OPMap[String]]): OPVector = {
+  private def getNullIndicatorsVector(keysSeq: Seq[Seq[String]], inputs: Seq[Map[String, TextList]]): OPVector = {
     val nullIndicators = keysSeq.zip(inputs).flatMap{ case (keys, input) =>
       keys.map{ k =>
-        val nullVal = if (input.value.contains(k)) 0.0 else 1.0
+        val nullVal = if (input.get(k).forall(_.isEmpty)) 1.0 else 0.0
         Seq(0 -> nullVal)
       }
     }
@@ -280,3 +281,4 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
     vector.toOPVector
   }
 }
+
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
index 647882a937..9c847cdc0a 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
@@ -72,9 +72,10 @@ class
SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( numFeatures = $(numFeatures), numInputs = inN.length, maxNumOfFeatures = TransmogrifierDefaults.MaxNumOfFeatures, - forceSharedHashSpace = $(forceSharedHashSpace), + forceSharedHashSpace = getForceSharedHashSpace, binaryFreq = $(binaryFreq), - hashAlgorithm = HashAlgorithm.withNameInsensitive($(hashAlgorithm)) + hashAlgorithm = getHashAlgorithm, + hashSpaceStrategy = getHashSpaceStrategy ) def fitFn(dataset: Dataset[Seq[T#Value]]): SequenceModel[T, OPVector] = { @@ -213,9 +214,9 @@ final class SmartTextVectorizerModel[T <: Text] private[op] (row: Seq[Text]) => { val (rowCategorical, rowText) = SmartTextVectorizer.partition[Text](row.toArray, args.isCategorical) val categoricalVector: OPVector = categoricalPivotFn(rowCategorical) - val textTokens: Seq[TextList] = rowText.map(tokenize(_)._2) + val textTokens: Seq[TextList] = rowText.map(tokenize(_).tokens) val textVector: OPVector = hash[TextList](textTokens, getTextTransientFeatures, args.hashingParams) - val textNullIndicatorsVector = if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(rowText)) else Seq.empty + val textNullIndicatorsVector = if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(textTokens)) else Seq.empty VectorsCombiner.combineOP(Seq(categoricalVector, textVector) ++ textNullIndicatorsVector) } @@ -224,11 +225,9 @@ final class SmartTextVectorizerModel[T <: Text] private[op] private def getTextTransientFeatures: Array[TransientFeature] = SmartTextVectorizer.partition[TransientFeature](getTransientFeatures(), args.isCategorical)._2 - private def getNullIndicatorsVector(features: Seq[Text]): OPVector = { - val nullIndicators = features.map { f => - val theseCat = convertToSet(f) - .groupBy(v => cleanTextFn(v.toString, args.shouldCleanText)).map { case (k, v) => k -> v.size } - val nullVal = if (theseCat.isEmpty) 1.0 else 0.0 + private def getNullIndicatorsVector(textTokens: Seq[TextList]): OPVector = { + val nullIndicators = textTokens.map { tokens => + val nullVal = if (tokens.isEmpty) 1.0 else 0.0 Seq(0 -> nullVal) } val reindexed = reindex(nullIndicators) @@ -243,7 +242,7 @@ trait MaxCardinalityParams extends Params { doc = "max number of distinct values a categorical feature can have", isValid = ParamValidators.inRange(lowerBound = 1, upperBound = SmartTextVectorizer.MaxCardinality) ) + final def setMaxCardinality(v: Int): this.type = set(maxCardinality, v) + final def getMaxCardinality: Int = $(maxCardinality) setDefault(maxCardinality -> SmartTextVectorizer.MaxCardinality) - def setMaxCardinality(v: Int): this.type = set(maxCardinality, v) - def getMaxCardinality: Int = $(maxCardinality) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala index 44a7dbeb8c..a360fb3132 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimator.scala @@ -92,15 +92,17 @@ final class TextMapNullModel[T <: OPMap[String]] private[op] uid: String )(implicit tti: TypeTag[T]) extends SequenceModel[T, OPVector](operationName = operationName, uid = uid) - with VectorizerDefaults with CleanTextMapFun { + with VectorizerDefaults with CleanTextMapFun with TextTokenizerParams { def transformFn: Seq[T] => OPVector = row => { row.zipWithIndex.flatMap { case (map, i) => val keys = allKeys(i) val cleaned = cleanMap(map.v, 
shouldCleanKey = cleanKeys, shouldCleanValue = cleanValues) + val tokenMap = cleaned.mapValues { v => v.toText }.mapValues(tokenize(_).tokens) - keys.map(k => if (cleaned.contains(k)) 0.0 else 1.0) + // Need to check if key is present, and also that our tokenizer will not remove the value + keys.map(k => if (cleaned.contains(k) && tokenMap(k).nonEmpty) 0.0 else 1.0) }.toOPVector } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala index fbe114dcba..c7c5147934 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala @@ -34,12 +34,13 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer -import com.salesforce.op.utils.text._ +import com.salesforce.op.stages.impl.feature.TextTokenizer.TextTokenizerResult +import com.salesforce.op.utils.text.{Language, _} import org.apache.spark.ml.param._ import scala.reflect.runtime.universe.TypeTag -trait TextTokenizerParams extends Params { +trait LanguageDetectionParams extends Params { /** * Indicates whether to attempt language detection. @@ -55,6 +56,7 @@ trait TextTokenizerParams extends Params { final val autoDetectThreshold = new DoubleParam(this, "autoDetectThreshold", "language detection threshold", ParamValidators.inRange(0.0, 1.0, true, true)) + def setAutoDetectThreshold(value: Double): this.type = set(autoDetectThreshold, value) def getAutoDetectThreshold: Double = $(autoDetectThreshold) @@ -65,6 +67,11 @@ trait TextTokenizerParams extends Params { def setDefaultLanguage(value: Language): this.type = set(defaultLanguage, value.entryName) def getDefaultLanguage: Language = Language.withName($(defaultLanguage)) +} + + +trait TextTokenizerParams extends LanguageDetectionParams { + /** * Minimum token length, >= 1. 
*/ @@ -82,18 +89,18 @@ trait TextTokenizerParams extends Params { def getToLowercase: Boolean = $(toLowercase) setDefault( + minTokenLength -> TextTokenizer.MinTokenLength, + toLowercase -> TextTokenizer.ToLowercase, autoDetectLanguage -> TextTokenizer.AutoDetectLanguage, autoDetectThreshold -> TextTokenizer.AutoDetectThreshold, - defaultLanguage -> TextTokenizer.DefaultLanguage.entryName, - minTokenLength -> TextTokenizer.MinTokenLength, - toLowercase -> TextTokenizer.ToLowercase + defaultLanguage -> TextTokenizer.DefaultLanguage.entryName ) def tokenize( text: Text, languageDetector: LanguageDetector = TextTokenizer.LanguageDetector, analyzer: TextAnalyzer = TextTokenizer.Analyzer - ): (Language, TextList) = TextTokenizer.tokenize( + ): TextTokenizerResult = TextTokenizer.tokenize( text = text, languageDetector = languageDetector, analyzer = analyzer, @@ -120,7 +127,7 @@ class TextTokenizer[T <: Text] uid: String = UID[TextTokenizer[_]] )(implicit tti: TypeTag[T]) extends UnaryTransformer[T, TextList](operationName = "textToken", uid = uid) with TextTokenizerParams { - def transformFn: T => TextList = text => tokenize(text, languageDetector, analyzer)._2 + def transformFn: T => TextList = text => tokenize(text, languageDetector, analyzer).tokens } object TextTokenizer { @@ -135,41 +142,64 @@ object TextTokenizer { val StripHtml = false /** - * Language wise text tokenization + * Language wise sentence tokenization * * @param text text to tokenize * @param languageDetector language detector instance * @param analyzer text analyzer instance + * @param sentenceSplitter sentence splitter instance * @param autoDetectLanguage whether to attempt language detection * @param defaultLanguage default language * @param autoDetectThreshold language detection threshold * @param toLowercase whether to convert all characters to lowercase before tokenizing * @param minTokenLength minimum token length - * @return detected language and tokens + * @return detected language and sentence tokens */ def tokenize( text: Text, languageDetector: LanguageDetector = LanguageDetector, analyzer: TextAnalyzer = Analyzer, + sentenceSplitter: Option[SentenceSplitter] = None, autoDetectLanguage: Boolean = AutoDetectLanguage, defaultLanguage: Language = DefaultLanguage, autoDetectThreshold: Double = AutoDetectThreshold, toLowercase: Boolean = ToLowercase, minTokenLength: Int = MinTokenLength - ): (Language, TextList) = text match { - case SomeValue(Some(txt)) => - val language = - if (!autoDetectLanguage) defaultLanguage - else { - languageDetector - .detectLanguages(txt) - .collectFirst { case (lang, confidence) if confidence > autoDetectThreshold => lang } - .getOrElse(defaultLanguage) - } - val lowerTxt = if (toLowercase) txt.toLowerCase else txt - val tokens = analyzer.analyze(lowerTxt, language) - language -> tokens.filter(_.length >= minTokenLength).toTextList - case _ => - defaultLanguage -> TextList.empty + ): TextTokenizerResult = { + text match { + case SomeValue(Some(txt)) => + val language = + if (!autoDetectLanguage) defaultLanguage + else { + languageDetector + .detectLanguages(txt) + .collectFirst { case (lang, confidence) if confidence > autoDetectThreshold => lang } + .getOrElse(defaultLanguage) + } + val lowerTxt = if (toLowercase) txt.toLowerCase else txt + + val sentences = sentenceSplitter.map(_.getSentences(lowerTxt, language)) + .getOrElse(Seq(lowerTxt)) + .map { sentence => + val tokens = analyzer.analyze(sentence, language) + tokens.filter(_.length >= minTokenLength).toTextList + } + 
TextTokenizerResult(language, sentences)
+      case _ =>
+        TextTokenizerResult(defaultLanguage, Seq(TextList.empty))
+    }
+  }
+
+  /**
+   * Text tokenization result
+   *
+   * @param language  detected language
+   * @param sentences sentence tokens
+   */
+  case class TextTokenizerResult(language: Language, sentences: Seq[TextList]) {
+    /**
+     * All sentence tokens flattened together
+     */
+    def tokens: TextList = sentences.flatMap(_.value).toTextList
+  }
 }
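A sketch of the new sentence-aware tokenization API (the string literal is illustrative): tokenize now returns a TextTokenizerResult whose sentences preserve splitter boundaries, while .tokens recovers the previously returned flattened token list.

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.feature.TextTokenizer
import com.salesforce.op.utils.text.{OpenNLPAnalyzer, OpenNLPSentenceSplitter}

val res = TextTokenizer.tokenize(
  text = "Hello world. Goodbye world.".toText,
  analyzer = new OpenNLPAnalyzer(),                      // OpenNLP splitter pairs with its own analyzer
  sentenceSplitter = Some(new OpenNLPSentenceSplitter())
)
// res.sentences: Seq[TextList] - one token list per detected sentence
// res.tokens: TextList - all sentence tokens flattened together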
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala
index 08628b9d10..cab57f5174 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala
@@ -64,6 +64,7 @@ private[op] trait TransmogrifierDefaults {
   val HashWithIndex: Boolean = false
   val PrependFeatureName: Boolean = true
   val ForceSharedHashSpace: Boolean = true
+  val HashSpaceStrategy: HashSpaceStrategy = com.salesforce.op.stages.impl.feature.HashSpaceStrategy.Auto
   val CleanText: Boolean = true
   val CleanKeys: Boolean = false
   val HashAlgorithm: HashAlgorithm = com.salesforce.op.stages.impl.feature.HashAlgorithm.MurMur3
@@ -75,6 +76,7 @@ private[op] trait TransmogrifierDefaults {
   val MinDocFrequency: Int = 0
   // Default is to fill missing Geolocations with the mean, but if fillWithConstant is chosen, use this
   val DefaultGeolocation: Geolocation = Geolocation(0.0, 0.0, GeolocationAccuracy.Unknown)
+  val MinInfoGain: Double = DecisionTreeNumericBucketizer.MinInfoGain
 }

 private[op] object TransmogrifierDefaults extends TransmogrifierDefaults
@@ -86,10 +88,12 @@ private[op] case object Transmogrifier {
   *
   * @param features input features
   * @param defaults transmogrifier defaults (allows params injection)
+  * @param label    optional label feature to be passed into stages that require the label column
   * @return vectorized features grouped by type
   */
  def transmogrify(
-   features: Seq[FeatureLike[_]]
+   features: Seq[FeatureLike[_]],
+   label: Option[FeatureLike[RealNN]] = None
  )(implicit defaults: TransmogrifierDefaults): Iterable[FeatureLike[OPVector]] = {
    import defaults._
    def castSeqAs[U <: FeatureType](f: Seq[FeatureLike[_]]) = f.map(_.asInstanceOf[FeatureLike[U]])
@@ -141,7 +145,7 @@
      case t if t =:= weakTypeOf[CurrencyMap] => val (f, other) = castAs[CurrencyMap](g)
        f.vectorize(defaultValue = FillValue, fillWithMean = FillWithMean, cleanKeys = CleanKeys, others = other,
-         trackNulls = TrackNulls)
+         trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label)
      case t if t =:= weakTypeOf[DateMap] => val (f, other) = castAs[DateMap](g) // TODO make better default
        f.vectorize(defaultValue = FillValue, cleanKeys = CleanKeys, others = other, trackNulls = TrackNulls)
@@ -159,7 +163,7 @@
      case t if t =:= weakTypeOf[IntegralMap] => val (f, other) = castAs[IntegralMap](g)
        f.vectorize(defaultValue = FillValue, fillWithMode = FillWithMode, cleanKeys = CleanKeys, others = other,
-         trackNulls = TrackNulls)
+         trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label)
      case t if t =:= weakTypeOf[MultiPickListMap] => val (f, other) = castAs[MultiPickListMap](g)
        f.vectorize(topK = TopK, minSupport = MinSupport, cleanText = CleanText, cleanKeys = CleanKeys,
@@ -167,7 +171,7 @@
      case t if t =:= weakTypeOf[PercentMap] => val (f, other) = castAs[PercentMap](g)
        f.vectorize(defaultValue = FillValue, fillWithMean = FillWithMean, cleanKeys = CleanKeys, others = other,
-         trackNulls = TrackNulls)
+         trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label)
      case t if t =:= weakTypeOf[PhoneMap] => val (f, other) = castAs[PhoneMap](g) // TODO make better default
        f.vectorize(defaultRegion = PhoneNumberParser.DefaultRegion, others = other, trackNulls = TrackNulls)
@@ -178,7 +182,7 @@
      case t if t =:= weakTypeOf[RealMap] => val (f, other) = castAs[RealMap](g)
        f.vectorize(defaultValue = FillValue, fillWithMean = FillWithMean, cleanKeys = CleanKeys, others = other,
-         trackNulls = TrackNulls)
+         trackNulls = TrackNulls, trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, label = label)
      case t if t =:= weakTypeOf[TextAreaMap] => val (f, other) = castAs[TextAreaMap](g)
        // Explicitly set cleanText to false here in order to match behavior of Text vectorization
@@ -223,7 +227,8 @@
        f.vectorize(fillValue = BinaryFillValue, trackNulls = TrackNulls, others = other)
      case t if t =:= weakTypeOf[Currency] => val (f, other) = castAs[Currency](g)
-       f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, others = other)
+       f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls,
+         trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label)
      case t if t =:= weakTypeOf[Date] => val (f, other) = castAs[Date](g)
        f.vectorize(dateListPivot = DateListDefault, referenceDate = ReferenceDate, others = other)
@@ -232,13 +237,16 @@
        f.vectorize(dateListPivot = DateListDefault, referenceDate = ReferenceDate, others = other)
      case t if t =:= weakTypeOf[Integral] => val (f, other) = castAs[Integral](g)
-       f.vectorize(fillValue = FillValue, fillWithMode = FillWithMode, trackNulls = TrackNulls, others = other)
+       f.vectorize(fillValue = FillValue, fillWithMode = FillWithMode, trackNulls = TrackNulls,
+         trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label)
      case t if t =:= weakTypeOf[Percent] => val (f, other) = castAs[Percent](g)
-       f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, others = other)
+       f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls,
+         trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label)
      case t if t =:= weakTypeOf[Real] => val (f, other) = castAs[Real](g)
-       f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls, others = other)
+       f.vectorize(fillValue = FillValue, fillWithMean = FillWithMean, trackNulls = TrackNulls,
+         trackInvalid = TrackInvalid, minInfoGain = MinInfoGain, others = other, label = label)
      case t if t =:= weakTypeOf[RealNN] => val (f, other) = castAs[RealNN](g)
        f.vectorize(other)
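A sketch of the label threading above (feature names hypothetical; Transmogrifier is private[op], so this only compiles inside the op package): when a label is supplied, numeric types are additionally routed through decision-tree bucketizers gated by MinInfoGain.

// survived: FeatureLike[RealNN]; age, income: numeric features
implicit val defaults: TransmogrifierDefaults = TransmogrifierDefaults
val vectorized: Iterable[FeatureLike[OPVector]] =
  Transmogrifier.transmogrify(features = Seq(age, income), label = Option(survived))
// without a label the numeric cases still vectorize, but no info-gain buckets are added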
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala b/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala
index a0f9642372..24aaf78150 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCO.scala
@@ -34,12 +34,13 @@ package com.salesforce.op.stages.impl.insights
 import com.salesforce.op.UID
 import com.salesforce.op.features.types._
 import com.salesforce.op.stages.base.unary.UnaryTransformer
-import org.apache.spark.ml.Transformer
-import org.apache.spark.ml.SparkModelConverter._
-import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
+import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter._
 import com.salesforce.op.utils.spark.OpVectorMetadata
-import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.Model
+import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.param.IntParam
+
 import scala.collection.mutable.PriorityQueue

 /**
@@ -48,7 +49,8 @@
  * @param model model instance that you wish to explain
  * @param uid   uid for instance
  */
-class RecordInsightsLOCO[T <: SparkWrapperParams[_]]
+@Experimental
+class RecordInsightsLOCO[T <: Model[T]]
 (
   val model: T,
   uid: String = UID[RecordInsightsLOCO[_]]
@@ -62,7 +64,7 @@ class RecordInsightsLOCO[T <: SparkWrapperParams[_]]
   def getTopK: Int = $(topK)
   setDefault(topK -> 20)

-  private val modelApply = toOP(model.getSparkMlStage().map(_.asInstanceOf[Transformer])).transformFn
+  private val modelApply = toOPUnchecked(model).transformFn
   private val labelDummy = RealNN(0.0)
   private lazy val featureInfo = OpVectorMetadata(getInputSchema()(in1.name)).getColumnHistory().map(_.toJson(false))
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala
index 9ceafd60fd..5e0a23cff8 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala
@@ -162,6 +162,14 @@ trait SanityCheckerParams extends Params {
   def setRemoveFeatureGroup(value: Boolean): this.type = set(removeFeatureGroup, value)
   def getRemoveFeatureGroup: Boolean = $(removeFeatureGroup)

+  final val protectTextSharedHash = new BooleanParam(
+    parent = this, name = "protectTextSharedHash",
+    doc = "If true, an individual hash is dropped/kept independently of related null indicators and" +
+      " other hashes in the same shared hash space."
+  )
+  def setProtectTextSharedHash(value: Boolean): this.type = set(protectTextSharedHash, value)
+  def getProtectTextSharedHash: Boolean = $(protectTextSharedHash)
+
   final val maxRuleConfidence = new DoubleParam(
     parent = this, name = "maxRuleConfidence",
     doc = "Maximum allowed confidence of association rules in categorical variables.
A categorical variable will be " + @@ -193,6 +201,7 @@ trait SanityCheckerParams extends Params { maxCramersV -> SanityChecker.MaxCramersV, removeBadFeatures -> SanityChecker.RemoveBadFeatures, removeFeatureGroup -> SanityChecker.RemoveFeatureGroup, + protectTextSharedHash -> SanityChecker.ProtectTextSharedHash, correlationType -> SanityChecker.CorrelationType, maxRuleConfidence -> SanityChecker.MaxRuleConfidence, minRequiredRuleSupport -> SanityChecker.MinRequiredRuleSupport @@ -240,10 +249,10 @@ class SanityChecker(uid: String = UID[SanityChecker]) indicatorGroup <- col.indicatorGroup } yield (indicatorGroup, (col, col.index)) - nullGroups.groupBy(_._1).foreach { - case (group, cols) => - require(cols.length == 1, s"Vector column $group has multiple null indicator fields: $cols") - } + nullGroups.groupBy(_._1).foreach { + case (group, cols) => + require(cols.length == 1, s"Vector column $group has multiple null indicator fields: $cols") + } def maxByParent(seq: Seq[(String, Double)]) = seq.groupBy(_._1).map{ case(k, v) => // Filter out the NaNs because max(3.4, NaN) = NaN, and we still want the keep the largest correlation @@ -332,6 +341,7 @@ class SanityChecker(uid: String = UID[SanityChecker]) val maxRuleConf = $(maxRuleConfidence) val minReqRuleSupport = $(minRequiredRuleSupport) val removeFromParent = $(removeFeatureGroup) + val textSharedHashProtected = $(protectTextSharedHash) // Calculate groups to remove separately. This is for more complicated checks where you can't determine whether // to remove a feature from a single column stats (eg. associate rule confidence/support check) @@ -357,6 +367,7 @@ class SanityChecker(uid: String = UID[SanityChecker]) maxRuleConfidence = maxRuleConf, minRequiredRuleSupport = minReqRuleSupport, removeFeatureGroup = removeFromParent, + protectTextSharedHash = textSharedHashProtected, removedGroups = ruleConfGroupsToDrop ) if reasons.nonEmpty @@ -374,9 +385,8 @@ class SanityChecker(uid: String = UID[SanityChecker]) ): Array[CategoricalGroupStats] = { // Figure out which columns correspond to MultiPickList values so that we can make the "OTHER" columns at most 1 so // that we can still use contingency matrices to calculate Cramer's V values - val multiPickList = FeatureType.shortTypeName[MultiPickList] val multiPickListIndices = columnMeta.zipWithIndex.collect { - case (col, index) if col.hasParentOfType(multiPickList) => index + case (col, index) if col.hasParentOfSubType[MultiPickList] => index }.toSet // Group by label and then add in a 1.0 so we can get the total occurrences for each label in one reduction @@ -410,7 +420,7 @@ class SanityChecker(uid: String = UID[SanityChecker]) .groupBy(_._1) // Keep track of the group, column name, column index, and whether the parent was a MultiPickList or not .map { case (group, cols) => (group, cols.map(_._2.makeColName()), cols.map(_._2.index), - cols.exists(_._2.hasParentOfType(multiPickList))) + cols.exists(_._2.hasParentOfSubType[MultiPickList])) } colIndicesByIndicatorGroup.map { @@ -624,6 +634,7 @@ object SanityChecker { val MaxCramersV = 0.95 val RemoveBadFeatures = false val RemoveFeatureGroup = true + val ProtectTextSharedHash = false val CorrelationType = Pearson // These settings will make the maxRuleConfidence check off by default val MaxRuleConfidence = 1.0 @@ -667,6 +678,8 @@ private[op] case class ColumnStatistics * @param maxCramersV Maximum Cramer's V value * @param maxRuleConfidence Minimum association rule confidence between * @param minRequiredRuleSupport Minimum required support to throw 
away a group
+ * @param removeFeatureGroup     Whether to remove entire feature group when any group value is flagged for removal
+ * @param protectTextSharedHash  Whether to protect text shared hash from related null indicator and other hashes
  * @param removedGroups          Pre-determined feature groups to remove (eg. via maxRuleConfidence)
  * @return List[String] if reason to remove, nil otherwise
  */
@@ -678,6 +691,7 @@ private[op] case class ColumnStatistics
   maxRuleConfidence: Double,
   minRequiredRuleSupport: Double,
   removeFeatureGroup: Boolean,
+  protectTextSharedHash: Boolean,
   removedGroups: Seq[String]
 ): List[String] = {
   if (isLabel) List() // never remove the label!
@@ -707,18 +721,31 @@ private[op] case class ColumnStatistics
     ).flatten

     val parentExclusionReasons =
-      if (removeFeatureGroup) List(
-        parentCramersV.filter(_ > maxCramersV).map(cv =>
-          s"Cramer's V $cv for something in parent feature set higher than max Cramer's V $maxCramersV"),
-        parentCorr.filter(_ > maxCorrelation).map(corr =>
-          s"correlation $corr for something in parent feature set higher than max correlation $maxCorrelation")
-      ).flatten
-      else List.empty[String]
+      if (removeFeatureGroup && (!column.forall(isTextSharedHash) || !protectTextSharedHash)) {
+        List(
+          parentCramersV.filter(_ > maxCramersV).map(cv =>
+            s"Cramer's V $cv for something in parent feature set higher than max Cramer's V $maxCramersV"),
+          parentCorr.filter(_ > maxCorrelation).map(corr =>
+            s"correlation $corr for something in parent feature set higher than max correlation $maxCorrelation")
+        ).flatten
+      } else List.empty[String]

     exclusionReasons ++ parentExclusionReasons
   }
 }

+  /**
+   * Is column a shared hash feature that is derived from Text, TextArea, TextMap, or TextAreaMap
+   *
+   * @param metadata metadata of column
+   * @return true if the column is a shared hash feature derived from text
+   */
+  def isTextSharedHash(metadata: OpVectorColumnMetadata): Boolean = {
+    val isDerivedFromText = metadata.hasParentOfType[Text] || metadata.hasParentOfType[TextArea] ||
+      metadata.hasParentOfType[TextMap] || metadata.hasParentOfType[TextAreaMap]
+    isDerivedFromText && metadata.indicatorGroup.isEmpty && metadata.indicatorValue.isEmpty
+  }
+
   override def toString: String = {
     val description = if (isLabel) "Label" else s"Feature"
     s"$description $name has: " +
@@ -752,3 +779,4 @@ object CorrelationType extends Enum[CorrelationType] {
  */
 case object Spearman extends CorrelationType("spearman")
}
+
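A sketch of wiring the new flag into a SanityChecker (label and featureVector are hypothetical features): with protection on, each text hash column is judged on its own statistics instead of being removed together with its shared-hash feature group.

import com.salesforce.op.stages.impl.preparators.SanityChecker

val checked = new SanityChecker()
  .setProtectTextSharedHash(true)  // keep/drop individual hashes independently
  .setRemoveBadFeatures(true)
  .setInput(label, featureVector)  // label: RealNN, featureVector: OPVector
  .getOutput()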
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressor.scala
new file mode 100644
index 0000000000..4ce76dd324
--- /dev/null
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressor.scala
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.regression
+
+import com.salesforce.op.UID
+import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
+import com.salesforce.op.stages.impl.CheckIsResponseValues
+import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
+import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
+import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor, OpDecisionTreeRegressorParams}
+
+import scala.reflect.runtime.universe.TypeTag
+
+/**
+ * Wrapper for spark Decision Tree Regressor [[org.apache.spark.ml.regression.DecisionTreeRegressor]]
+ * @param uid stage uid
+ */
+class OpDecisionTreeRegressor(uid: String = UID[OpDecisionTreeRegressor])
+  extends OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel](
+    predictor = new DecisionTreeRegressor(),
+    uid = uid
+  ) with OpDecisionTreeRegressorParams {
+
+  override protected def onSetInput(): Unit = {
+    super.onSetInput()
+    CheckIsResponseValues(in1, in2)
+  }
+
+  /** @group setParam */
+  override def setMaxDepth(value: Int): this.type = set(maxDepth, value)
+
+  /** @group setParam */
+  override def setMaxBins(value: Int): this.type = set(maxBins, value)
+
+  /** @group setParam */
+  override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value)
+
+  /** @group setParam */
+  override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value)
+
+  /** @group expertSetParam */
+  override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value)
+
+  /** @group expertSetParam */
+  override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value)
+
+  /**
+   * Specifies how often to checkpoint the cached node IDs.
+   * E.g. 10 means that the cache will get checkpointed every 10 iterations.
+   * This is only used if cacheNodeIds is true and if the checkpoint directory is set in
+   * [[org.apache.spark.SparkContext]].
+   * Must be at least 1.
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + /** @group setParam */ + def setVarianceCol(value: String): this.type = set(varianceCol, value) + +} + +/** + * Class that takes in a spark DecisionTreeRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpDecisionTreeRegressionModel +( + sparkModel: DecisionTreeRegressionModel, + uid: String = UID[OpDecisionTreeRegressionModel], + operationName: String = classOf[DecisionTreeRegressor].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[DecisionTreeRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressor.scala new file mode 100644 index 0000000000..8083b67250 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressor.scala @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor, OpGBTRegressorParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark GBT Regressor [[org.apache.spark.ml.regression.GBTRegressor]] + * @param uid stage uid + */ +class OpGBTRegressor(uid: String = UID[OpGBTRegressor]) + extends OpPredictorWrapper[GBTRegressor, GBTRegressionModel]( + predictor = new GBTRegressor(), + uid = uid + ) with OpGBTRegressorParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + // Parameters from TreeRegressorParams: + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1. + * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** + * The impurity setting is ignored for GBT models. + * Individual trees are built using impurity "Variance." 
+ * + * @group setParam + */ + override def setImpurity(value: String): this.type = { + logWarning("GBTRegressor.setImpurity should NOT be used") + this + } + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from GBTParams: + + /** @group setParam */ + override def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** @group setParam */ + override def setStepSize(value: Double): this.type = set(stepSize, value) + + // Parameters from GBTRegressorParams: + + /** @group setParam */ + def setLossType(value: String): this.type = set(lossType, value) +} + + +/** + * Class that takes in a spark GBTRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpGBTRegressionModel +( + sparkModel: GBTRegressionModel, + uid: String = UID[OpGBTRegressionModel], + operationName: String = classOf[GBTRegressor].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[GBTRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala new file mode 100644 index 0000000000..5f13d5bd45 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegression.scala @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{GeneralizedLinearRegression, GeneralizedLinearRegressionModel, OpGeneralizedLinearRegressionParams} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper for spark Generalized Regression [[org.apache.spark.ml.regression.GeneralizedLinearRegression]] + * @param uid stage uid + */ +class OpGeneralizedLinearRegression(uid: String = UID[OpGeneralizedLinearRegression]) + extends OpPredictorWrapper[GeneralizedLinearRegression, GeneralizedLinearRegressionModel]( + predictor = new GeneralizedLinearRegression(), + uid = uid + ) with OpGeneralizedLinearRegressionParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + /** + * Sets the value of param [[family]]. + * Default is "gaussian". + * + * @group setParam + */ + def setFamily(value: String): this.type = set(family, value) + setDefault(family -> "gaussian") + + /** + * Sets the value of param [[variancePower]]. + * Used only when family is "tweedie". + * Default is 0.0, which corresponds to the "gaussian" family. + * + * @group setParam + */ + def setVariancePower(value: Double): this.type = set(variancePower, value) + setDefault(variancePower -> 0.0) + + /** + * Sets the value of param [[linkPower]]. + * Used only when family is "tweedie". + * + * @group setParam + */ + def setLinkPower(value: Double): this.type = set(linkPower, value) + + /** + * Sets the value of param [[link]]. + * Used only when family is not "tweedie". + * + * @group setParam + */ + def setLink(value: String): this.type = set(link, value) + + /** + * Sets if we should fit the intercept. + * Default is true. + * + * @group setParam + */ + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + + /** + * Sets the maximum number of iterations (applicable for solver "irls"). + * Default is 25. + * + * @group setParam + */ + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 25) + + /** + * Sets the convergence tolerance of iterations. + * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-6. + * + * @group setParam + */ + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) + + /** + * Sets the regularization parameter for L2 regularization. + * The regularization term is + *
+ * $$ + * 0.5 * regParam * L2norm(coefficients)^2 + * $$ + *+ * Default is 0.0. + * + * @group setParam + */ + def setRegParam(value: Double): this.type = set(regParam, value) + setDefault(regParam -> 0.0) + + /** + * Sets the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. + * Default is not set, so all instances have weight one. + * In the Binomial family, weights correspond to number of trials and should be integer. + * Non-integer weights are rounded to integer in AIC calculation. + * + * @group setParam + */ + def setWeightCol(value: String): this.type = set(weightCol, value) + + /** + * Sets the solver algorithm used for optimization. + * Currently only supports "irls" which is also the default solver. + * + * @group setParam + */ + def setSolver(value: String): this.type = set(solver, value) + setDefault(solver -> "irls") + + /** + * Sets the link prediction (linear predictor) column name. + * + * @group setParam + */ + def setLinkPredictionCol(value: String): this.type = set(linkPredictionCol, value) + +} + + + +/** + * Class that takes in a spark GeneralizedLinearRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpGeneralizedLinearRegressionModel +( + sparkModel: GeneralizedLinearRegressionModel, + uid: String = UID[GeneralizedLinearRegressionModel], + operationName: String = classOf[GeneralizedLinearRegression].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictorWrapperModel[GeneralizedLinearRegressionModel](uid = uid, operationName = operationName, + sparkModel = sparkModel) { + + @transient lazy private val predictLink = reflectMethod(getSparkMlStage().get, "predictLink") + @transient lazy private val predict = reflectMethod(getSparkMlStage().get, "predict") + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { + val raw = predictLink.apply(features.value).asInstanceOf[Double] + val pred = predict.apply(features.value).asInstanceOf[Double] + Prediction(prediction = pred, rawPrediction = raw) + } +} + + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala index e05bd15075..23fc7d888c 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpLinearRegression.scala @@ -32,19 +32,23 @@ package com.salesforce.op.stages.impl.regression import com.salesforce.op._ -import com.salesforce.op.features.types._ +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.sparkwrappers.specific.OpPredictorWrapper -import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel, OpLinearRegressionParams} + +import scala.reflect.runtime.universe.TypeTag 
+ /** - * Wrapper around spark ml linear regression for use with OP pipelines + * Wrapper around spark ml linear regression [[org.apache.spark.ml.regression.LinearRegression]] */ class OpLinearRegression(uid: String = UID[OpLinearRegression]) - extends OpPredictorWrapper[RealNN, RealNN, LinearRegression, LinearRegressionModel]( + extends OpPredictorWrapper[LinearRegression, LinearRegressionModel]( predictor = new LinearRegression(), uid = uid -){ +) with OpLinearRegressionParams { override protected def onSetInput(): Unit = { super.onSetInput() @@ -57,161 +61,127 @@ class OpLinearRegression(uid: String = UID[OpLinearRegression]) * * @group setParam */ - def setRegParam(value: Double): this.type = { - getSparkStage.setRegParam(value) - this - } + def setRegParam(value: Double): this.type = set(regParam, value) + setDefault(regParam -> 0.0) /** - * Set the ElasticNet mixing parameter. - * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - * For 0 < alpha < 1, the penalty is a combination of L1 and L2. - * Default is 0.0 which is an L2 penalty. + * Set if we should fit the intercept. + * Default is true. * * @group setParam */ - def setElasticNetParam(value: Double): this.type = { - getSparkStage.setElasticNetParam(value) - this - } + def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) + setDefault(fitIntercept -> true) /** - * Set the maximum number of iterations. - * Default is 100. + * Whether to standardize the training features before fitting the model. + * The coefficients of models will be always returned on the original scale, + * so it will be transparent for users. + * Default is true. + * + * @note With/without standardization, the models should be always converged + * to the same solution when no regularization is applied. In R's GLMNET package, + * the default behavior is true as well. * * @group setParam */ - def setMaxIter(value: Int): this.type = { - getSparkStage.setMaxIter(value) - this - } + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization -> true) /** - * Set the convergence tolerance of iterations. - * Smaller value will lead to higher accuracy with the cost of more iterations. - * Default is 1E-6. + * Set the ElasticNet mixing parameter. + * For alpha = 0, the penalty is an L2 penalty. + * For alpha = 1, it is an L1 penalty. + * For alpha in (0,1), the penalty is a combination of L1 and L2. + * Default is 0.0 which is an L2 penalty. * * @group setParam */ - def setTol(value: Double): this.type = { - getSparkStage.setTol(value) - this - } + def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) + setDefault(elasticNetParam -> 0.0) /** - * Whether to fit an intercept term. - * Default is true. + * Set the maximum number of iterations. + * Default is 100. * * @group setParam */ - def setFitIntercept(value: Boolean): this.type = { - getSparkStage.setFitIntercept(value) - this - } + def setMaxIter(value: Int): this.type = set(maxIter, value) + setDefault(maxIter -> 100) /** - * Whether to standardize the training features before fitting the model. - * The coefficients of models will be always returned on the original scale, - * so it will be transparent for users. Note that with/without standardization, - * the models should be always converged to the same solution when no regularization - * is applied. In R's GLMNET package, the default behavior is true as well. - * Default is true. + * Set the convergence tolerance of iterations. 
+ * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-6. * * @group setParam */ - def setStandardization(value: Boolean): this.type = { - getSparkStage.setStandardization(value) - this - } + def setTol(value: Double): this.type = set(tol, value) + setDefault(tol -> 1E-6) /** * Whether to over-/under-sample training instances according to the given weights in weightCol. - * If not set or empty String, all instances are treated equally (weight 1.0). + * If not set or empty, all instances are treated equally (weight 1.0). * Default is not set, so all instances have weight one. * * @group setParam */ - def setWeightCol(value: String): this.type = { - getSparkStage.setWeightCol(value) - this - } + def setWeightCol(value: String): this.type = set(weightCol, value) /** - * Set the solver algorithm used for optimization. In case of linear regression, this can be "l-bfgs", "normal" and - * "auto". - * "l-bfgs": Limited-memory BFGS which is a limited-memory quasi-Newton optimization method. - * "normal": Normal Equation as an analytical solution to the linear regression problem. - * "auto" (default): solver algorithm is selected automatically. The Normal Equations solver will be used when - * possible, but this will automatically fall back to iterative optimization methods when needed. + * Set the solver algorithm used for optimization. + * In case of linear regression, this can be "l-bfgs", "normal" and "auto". + * - "l-bfgs" denotes Limited-memory BFGS which is a limited-memory quasi-Newton + * optimization method. + * - "normal" denotes using Normal Equation as an analytical solution to the linear regression + * problem. This solver is limited to `LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER`. + * - "auto" (default) means that the solver algorithm is selected automatically. + * The Normal Equations solver will be used when possible, but this will automatically fall + * back to iterative optimization methods when needed. * * @group setParam */ def setSolver(value: String): this.type = { - getSparkStage.setSolver(value) - this + require(Set("auto", "l-bfgs", "normal").contains(value), + s"Solver $value was not supported. Supported options: auto, l-bfgs, normal") + set(solver, value) } + setDefault(solver -> "auto") /** - * Get the regularization parameter. + * Suggested depth for treeAggregate (greater than or equal to 2). + * If the dimensions of features or the number of partitions are large, + * this param could be adjusted to a larger size. + * Default is 2. * + * @group expertSetParam */ - def getRegParam: Double = { - getSparkStage.getRegParam - } + def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) + setDefault(aggregationDepth -> 2) - /** - * Get the ElasticNet mixing parameter. - * - */ - def getElasticNetParam: Double = { - getSparkStage.getElasticNetParam - } - - /** - * Get the maximum number of iterations. - * - */ - def getMaxIter: Int = { - getSparkStage.getMaxIter - } - - /** - * Get the convergence tolerance of iterations. 
- * - */ - def getTol: Double = { - getSparkStage.getTol - } - - /** - * Get the fit intercept boolean parameter - * - */ - def getFitIntercept: Boolean = { - getSparkStage.getFitIntercept - } +} - /** - * Get the standardization boolean parameter - * - */ - def getStandardization: Boolean = { - getSparkStage.getStandardization - } - /** - * Get the weights in weightCol defining whether to over-/under-sample training instances - * - */ - def getWeightCol: String = { - getSparkStage.getWeightCol - } - - /** - * Get the solver algorithm used for optimization - * - */ - def getSolver: String = { - getSparkStage.getSolver - } +/** + * Class that takes in a spark LinearRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpLinearRegressionModel +( + sparkModel: LinearRegressionModel, + uid: String = UID[OpLinearRegressionModel], + operationName: String = classOf[LinearRegression].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[LinearRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") } + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressor.scala b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressor.scala new file mode 100644 index 0000000000..7f92aaa478 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressor.scala @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.UID +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.impl.CheckIsResponseValues +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper} +import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod +import org.apache.spark.ml.regression.{OpRandomForestRegressorParams, RandomForestRegressionModel, RandomForestRegressor} + +import scala.reflect.runtime.universe.TypeTag + +/** + * Wrapper around spark Random Forest Regressor [[org.apache.spark.ml.regression.RandomForestRegressor]] + * @param uid stage uid + */ +class OpRandomForestRegressor(uid: String = UID[OpRandomForestRegressor]) + extends OpPredictorWrapper[RandomForestRegressor, RandomForestRegressionModel]( + predictor = new RandomForestRegressor(), + uid = uid + ) with OpRandomForestRegressorParams { + + override protected def onSetInput(): Unit = { + super.onSetInput() + CheckIsResponseValues(in1, in2) + } + + // Parameters from TreeRegressorParams: + + /** @group setParam */ + override def setMaxDepth(value: Int): this.type = set(maxDepth, value) + + /** @group setParam */ + override def setMaxBins(value: Int): this.type = set(maxBins, value) + + /** @group setParam */ + override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) + + /** @group setParam */ + override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value) + + /** @group expertSetParam */ + override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) + + /** @group expertSetParam */ + override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value) + + /** + * Specifies how often to checkpoint the cached node IDs. + * E.g. 10 means that the cache will get checkpointed every 10 iterations. + * This is only used if cacheNodeIds is true and if the checkpoint directory is set in + * [[org.apache.spark.SparkContext]]. + * Must be at least 1.
+ * (default = 10) + * @group setParam + */ + override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + + /** @group setParam */ + override def setImpurity(value: String): this.type = set(impurity, value) + + // Parameters from TreeEnsembleParams: + + /** @group setParam */ + override def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) + + /** @group setParam */ + override def setSeed(value: Long): this.type = set(seed, value) + + // Parameters from RandomForestParams: + + /** @group setParam */ + override def setNumTrees(value: Int): this.type = set(numTrees, value) + + /** @group setParam */ + override def setFeatureSubsetStrategy(value: String): this.type = + set(featureSubsetStrategy, value) + +} + +/** + * Class that takes in a spark RandomForestRegressionModel and wraps it into an OP model which returns a + * Prediction feature + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + */ +class OpRandomForestRegressionModel +( + sparkModel: RandomForestRegressionModel, + uid: String = UID[OpRandomForestRegressionModel], + operationName: String = classOf[RandomForestRegressor].getSimpleName +)( + implicit tti1: TypeTag[RealNN], + tti2: TypeTag[OPVector], + tto: TypeTag[Prediction], + ttov: TypeTag[Prediction#Value] +) extends OpPredictionModel[RandomForestRegressionModel]( + sparkModel = sparkModel, uid = uid, operationName = operationName +) { + @transient lazy val predictMirror = reflectMethod(getSparkMlStage().get, "predict") +} + +
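For orientation, a minimal usage sketch of the new wrapper above, assuming the standard TransmogrifAI two-input predictor API (a RealNN response plus an OPVector feature vector); the feature handles and parameter values are illustrative and not part of the patch:

    // label: FeatureLike[RealNN] and features: FeatureLike[OPVector] are assumed to exist
    val rf = new OpRandomForestRegressor()
      .setInput(label, features)
      .setNumTrees(50)           // from RandomForestParams
      .setMaxDepth(8)            // from TreeRegressorParams
      .setSubsamplingRate(0.8)   // from TreeEnsembleParams
    val prediction: FeatureLike[Prediction] = rf.getOutput()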
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala b/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala index 5344f769ad..94a6899954 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/selector/ModelSelectorBase.scala @@ -31,6 +31,7 @@ package com.salesforce.op.stages.impl.selector +import com.salesforce.op.utils.stages.FitStagesUtil._ import com.salesforce.op.UID import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.evaluators.{EvaluationMetrics, _} @@ -39,15 +40,14 @@ import com.salesforce.op.features.types._ import com.salesforce.op.readers.DataFrameFieldNames import com.salesforce.op.stages._ import com.salesforce.op.stages.impl.CheckIsResponseValues -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import com.salesforce.op.stages.impl.tuning._ import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import com.salesforce.op.utils.spark.RichMetadata._ import org.apache.spark.ml.param._ import org.apache.spark.ml.{Estimator, Model, Transformer} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.MetadataBuilder -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types.{MetadataBuilder, StructType} +import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import scala.reflect.runtime.universe._ import scala.util.Try @@ -191,6 +191,34 @@ private[op] abstract class ModelSelectorBase[M <: Model[_], E <: Estimator[_]] */ protected def getModelInfo: Seq[ModelInfo[E]] + /** + * Get the list of all the models and their parameters for comparison + * @return the model infos marked for use (i.e. with useModel set) + */ + protected[op] def getUsedModels: Seq[ModelInfo[E]] = getModelInfo.filter(m => $(m.useModel)) + + /** + * Find best estimator with validation on a workflow level. Executed when workflow level Cross Validation is on + * (see [[com.salesforce.op.OpWorkflow.withWorkflowCV]]) + * + * @param data data to validate + * @param dag dag done inside the Cross-validation/Train-validation split + * @param persistEveryKStages frequency of persisting the DAG's stages + * @param spark Spark Session + * @return Unit; sets bestEstimator on this selector to the best model along with its best paramMap + */ + protected[op] def findBestEstimator(data: Dataset[_], dag: StagesDAG, persistEveryKStages: Int = 0) + (implicit spark: SparkSession): Unit = { + + val theBestEstimator = validator.validate(modelInfo = getUsedModels, dataset = data, + label = in1.name, features = in2.name, dag = Option(dag), splitter = splitter, + stratifyCondition = validator.isClassification + ) + + bestEstimator = Option(theBestEstimator) + } + + // Map (name of param, value of param) of output column names def outputsColNamesMap: Map[String, String] = { val defaultNames = getOutputsColNamesMap(in1, in2) @@ -216,16 +244,15 @@ */ final override def fit(dataset: Dataset[_]): SelectedModel = { - import dataset.sparkSession.implicits._ + implicit val spark = dataset.sparkSession + import spark.implicits._ val datasetWithID = if (dataset.columns.contains(DataFrameFieldNames.KeyFieldName)) { dataset.select(in1.name, in2.name, DataFrameFieldNames.KeyFieldName) - .as[LabelFeaturesKey].persist() } else { dataset.select(in1.name, in2.name) .withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()) - .as[LabelFeaturesKey].persist() } require(!datasetWithID.isEmpty, "Dataset cannot be empty") @@ -234,13 +261,19 @@ private[op] abstract class ModelSelectorBase[M <: Model[_], E <: Estimator[_]] case None => new ModelData(datasetWithID, new MetadataBuilder()) } - - val bestModel = bestEstimator.map { case BestEstimator(name, estimator, meta) => - new BestModel(name = name, model = estimator.fit(trainData).asInstanceOf[M], metadata = Option(meta)) - }.getOrElse { + val BestEstimator(name, estimator, meta) = bestEstimator.getOrElse{ setInputSchema(dataset.schema).transformSchema(dataset.schema) - validator.validate(getModelInfo.filter(m => $(m.useModel)), trainData, in1.name, in2.name) + val best = validator + .validate(modelInfo = getUsedModels, dataset = trainData, label = in1.name, features = in2.name) + bestEstimator = Some(best) + best } + + val bestModel = new BestModel( + name = name, + model = estimator.fit(trainData).asInstanceOf[M], + metadata = Option(meta) + ) bestModel.metadata.foreach(meta => setMetadata(meta.build)) val bestClassifier = bestModel.model.parent log.info(s"Selected model : ${bestClassifier.getClass.getSimpleName}")
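The fit() rewrite above is a validate-once idiom: when workflow-level cross validation has already populated bestEstimator (via findBestEstimator), fit() skips validation entirely and only refits the winning estimator on the training split. A condensed sketch of that control flow, using only names that appear in this diff:

    // bestEstimator: Option[BestEstimator[E]] is selector state, filled here or by findBestEstimator
    val BestEstimator(name, estimator, meta) = bestEstimator.getOrElse {
      val best = validator.validate(
        modelInfo = getUsedModels, dataset = trainData,
        label = in1.name, features = in2.name)
      bestEstimator = Some(best) // cache so the expensive validation runs at most once
      best
    }
    // refit the winner on the full training split to produce the served model
    val bestModel = new BestModel(
      name = name,
      model = estimator.fit(trainData).asInstanceOf[M],
      metadata = Option(meta))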
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala index db6cf874c8..8a1b68f84a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataBalancer.scala @@ -33,9 +33,8 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.UID import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import org.apache.spark.ml.param._ -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.MetadataBuilder import org.slf4j.LoggerFactory @@ -74,6 +73,7 @@ case object DataBalancer { class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) with DataBalancerParams { @transient private lazy val log = LoggerFactory.getLogger(this.getClass) + @transient private[op] val metadataBuilder = new MetadataBuilder() /** * Computes the upSample and downSample proportions. @@ -120,44 +120,23 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) /** * Split into a training set and a test set and balance the training set * - * @param data to prepare for model training + * @param data to prepare for model training. First column must be the label as a double * @return balanced training set and a test set */ - def prepare(data: Dataset[LabelFeaturesKey]): ModelData = { + def prepare(data: Dataset[Row]): ModelData = { val ds = data.persist() - val Array(negativeData, positiveData) = Array(0.0, 1.0).map(label => ds.filter(_._1 == label).persist()) - val metadataBuilder = new MetadataBuilder() + val Array(negativeData, positiveData) = Array(0.0, 1.0).map(label => ds.filter(_.getDouble(0) == label).persist()) val balancerSeed = getSeed - // If these conditions are met, that means that we have enough information to balance the data : upSample, - // downSample and which class is in minority - if (isSet(isPositiveSmall) && isSet(downSampleFraction) && isSet(upSampleFraction)) { - val (down, up) = ($(downSampleFraction), $(upSampleFraction)) - log.info(s"Fractions are already known : downSample of ${down}, upSample of ${up}") - val (smallData, bigData) = if ($(isPositiveSmall)) (positiveData, negativeData) else (negativeData, positiveData) - new ModelData(rebalance(smallData, up, bigData, down, balancerSeed), metadataBuilder) - // If this condition is met, that means that the data is already balanced, but need to be sampled - } else if (isSet(alreadyBalancedFraction)) { - val f = $(alreadyBalancedFraction) - log.info(s"Data is already balanced, yet it will be sampled by a fraction of $f") - new ModelData(sampleBalancedData( - fraction = f, - seed = balancerSeed, - data = data, - positiveData = positiveData, - negativeData = negativeData), - metadataBuilder - ) - // Usual estimation by computing the sizes of the data - } else estimateAndBalance( - data = data, + prepareData( + data = ds, positiveData = positiveData, negativeData = negativeData, - metadataBuilder = metadataBuilder, seed = balancerSeed ) + } override def copy(extra: ParamMap): DataBalancer = { @@ -165,67 +144,55 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) copyValues(copy, extra) } + + + /** - * Estimate if data needs to be balanced or not. If so, computes sample fractions and balance data + * Estimate if data needs to be balanced or not. 
If so, computes sample fractions and sets the appropriate params * * @param data input data * @param positiveData data with positives only * @param negativeData data with negatives only - * @param metadataBuilder metadata * @param seed seed * @return Unit; sets the sampling fraction parameters on this balancer */ - private[op] def estimateAndBalance( - data: Dataset[LabelFeaturesKey], - positiveData: Dataset[LabelFeaturesKey], - negativeData: Dataset[LabelFeaturesKey], - metadataBuilder: MetadataBuilder, + private[op] def estimate[T]( + data: Dataset[T], + positiveData: Dataset[T], + negativeData: Dataset[T], seed: Long - ): ModelData = { + ): Unit = { val positiveCount = positiveData.count() val negativeCount = negativeData.count() val totalCount = positiveCount + negativeCount + val sampleF = getSampleFraction // feed metadata with counts and sample fraction metadataBuilder.putLong(ModelSelectorBaseNames.Positive, positiveCount) metadataBuilder.putLong(ModelSelectorBaseNames.Negative, negativeCount) - metadataBuilder.putDouble(ModelSelectorBaseNames.Desired, $(sampleFraction)) + metadataBuilder.putDouble(ModelSelectorBaseNames.Desired, sampleF) log.info(s"Data has $positiveCount positive and $negativeCount negative.") - val (smallCount, smallData, bigCount, bigData) = { + val (smallCount, bigCount) = { val isPosSmall = positiveCount < negativeCount setIsPositiveSmall(isPosSmall) - if (isPosSmall) (positiveCount, positiveData, negativeCount, negativeData) - else (negativeCount, negativeData, positiveCount, positiveData) + if (isPosSmall) (positiveCount, negativeCount) + else (negativeCount, positiveCount) } val maxTrainSample = getMaxTrainingSample if (smallCount < 100 || (smallCount + bigCount) < 500) { log.warn("!!!Attention!!! - there is not enough data to build a good model!") } - val sampleF = getSampleFraction - // if the current fraction is superior than the one expected if (smallCount.toDouble / totalCount.toDouble >= sampleF) { log.info( s"Not resampling data: $smallCount small count and $bigCount big count is greater than" + s" requested ${sampleF}" ) - // if data is too big downsample val fraction = if (maxTrainSample < totalCount) maxTrainSample / totalCount.toDouble else 1.0 - setAlreadyBalancedFraction(fraction) - // sample - new ModelData(sampleBalancedData( - fraction = fraction, - seed = seed, - data = data, - positiveData = positiveData, - negativeData = negativeData - ), - metadataBuilder) } else { log.info(s"Sampling data to get $sampleF split versus $smallCount small and $bigCount big") val (downSample, upSample) = getProportions(smallCount, bigCount, sampleF, maxTrainSample) @@ -257,7 +224,42 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) s"To make upsampling happen, please increase the max training sample size '${maxTrainingSample.name}'") } - new ModelData(rebalance(smallData, upSample, bigData, downSample, seed), metadataBuilder) + } + } + /** + * Prepares (balances and/or samples) the data + * + * @param data input data + * @param positiveData data with positives only + * @param negativeData data with negatives only + * @param seed seed + * @return balanced data + */ + private[op] def prepareData[T]( + data: Dataset[T], + positiveData: Dataset[T], + negativeData: Dataset[T], + seed: Long + ): ModelData = { + + if (!(isSet(isPositiveSmall) || isSet(downSampleFraction) || + isSet(upSampleFraction) || isSet(alreadyBalancedFraction))) { + estimate(data = data, positiveData = positiveData, negativeData = negativeData, seed = seed) + } + + // If these conditions are met, that means that we have enough 
information to balance the data: upSample, + // downSample and which class is in minority + if (isSet(isPositiveSmall) && isSet(downSampleFraction) && isSet(upSampleFraction)) { + val (down, up) = ($(downSampleFraction), $(upSampleFraction)) + log.info(s"Sample fractions: downSample of ${down}, upSample of ${up}") + val (smallData, bigData) = if ($(isPositiveSmall)) (positiveData, negativeData) else (negativeData, positiveData) + new ModelData(rebalance(smallData, up, bigData, down, seed).toDF(), metadataBuilder) + } else { // Data is already balanced, but needs to be sampled + val fraction = $(alreadyBalancedFraction) + log.info(s"Data is already balanced, yet it will be sampled by a fraction of $fraction") + val balanced = sampleBalancedData(fraction = fraction, seed = seed, + data = data, positiveData = positiveData, negativeData = negativeData).toDF() + new ModelData(balanced, metadataBuilder) } } @@ -272,13 +274,13 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) * @return balanced small and big data split into training and test sets * with downSample and upSample proportions */ - private[op] def rebalance( - smallData: Dataset[_], + private[op] def rebalance[T]( + smallData: Dataset[T], upSampleFraction: Double, - bigData: Dataset[_], + bigData: Dataset[T], downSampleFraction: Double, seed: Long - ): Dataset[LabelFeaturesKey] = { + ): Dataset[T] = { import smallData.sparkSession.implicits._ val bigDataTrain = bigData.sample(withReplacement = false, downSampleFraction, seed = seed) @@ -288,27 +290,26 @@ class DataBalancer(uid: String = UID[DataBalancer]) extends Splitter(uid = uid) case u => smallData.sample(withReplacement = false, u, seed = seed) // downsample instead } - smallDataTrain.as[LabelFeaturesKey].union(bigDataTrain.as[LabelFeaturesKey]) - + smallDataTrain.union(bigDataTrain) } /** * Sample already balanced data * - * @param fraction - * @param seed - * @param data - * @param positiveData - * @param negativeData + * @param fraction subsample to take + * @param seed seed to use in sampling + * @param data full dataset in case no sampling is needed + * @param positiveData positive data for stratified sampling + * @param negativeData negative data for stratified sampling * @return */ - private[op] def sampleBalancedData( + private[op] def sampleBalancedData[T]( fraction: Double, seed: Long, - data: Dataset[LabelFeaturesKey], - positiveData: Dataset[LabelFeaturesKey], - negativeData: Dataset[LabelFeaturesKey] - ): Dataset[LabelFeaturesKey] = { + data: Dataset[T], + positiveData: Dataset[T], + negativeData: Dataset[T] + ): Dataset[T] = { fraction match { case 1.0 => data // we don't sample // stratified sampling
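To make the rebalance() contract above concrete, a toy example with invented counts; the small-class branch that upsamples with replacement is elided in this hunk, so only the paths visible here are sketched:

    // 1,000 positives vs 99,000 negatives, targeting a ~10% minority share:
    // keep every positive (fraction 1.0) and roughly 9,000 negatives
    val down = 9000.0 / 99000.0  // ~0.091 of the big class
    val bigDataTrain = bigData.sample(withReplacement = false, down, seed = 42L)
    val smallDataTrain = smallData.sample(withReplacement = false, 1.0, seed = 42L)
    // ~1,000 positives vs ~9,000 negatives, i.e. a ~10% positive share
    val train = smallDataTrain.union(bigDataTrain)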
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala index 5762fa0277..6c9fb97b06 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataCutter.scala @@ -33,11 +33,10 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.UID import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import org.apache.spark.ml.param._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{Metadata, MetadataBuilder} -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.slf4j.LoggerFactory case object DataCutter { @@ -81,15 +80,15 @@ class DataCutter(uid: String = UID[DataCutter]) extends Splitter(uid = uid) with * function to use to prepare the dataset for modeling * e.g. do data balancing or dropping based on the labels * - * @param data + * @param data First column must be the label as a double * @return training set and test set */ - def prepare(data: Dataset[LabelFeaturesKey]): ModelData = { + def prepare(data: Dataset[Row]): ModelData = { import data.sparkSession.implicits._ val keep = if (!isSet(labelsToKeep) || !isSet(labelsToDrop)) { - val labels = data.map(r => r._1 -> 1L) + val labels = data.map(r => r.getDouble(0) -> 1L) val labelCounts = labels.groupBy(labels.columns(0)).sum(labels.columns(1)).persist() val (resKeep, resDrop) = estimate(labelCounts) labelCounts.unpersist() @@ -97,7 +96,7 @@ class DataCutter(uid: String = UID[DataCutter]) extends Splitter(uid = uid) with resKeep } else getLabelsToKeep.toSet - val dataUse = data.filter(r => keep.contains(r._1)) + val dataUse = data.filter(r => keep.contains(r.getDouble(0))) val labelsMeta = new MetadataBuilder() .putDoubleArray(ModelSelectorBaseNames.LabelsKept, getLabelsToKeep) @@ -127,7 +126,7 @@ class DataCutter(uid: String = UID[DataCutter]) extends Splitter(uid = uid) with val labelSet = labelsKeep.toSet val labelsDropped = labelCounts.filter(r => !labelSet.contains(r.getDouble(0))).collect().map(_.getDouble(0)).toSet - if (labelSet.size > 1) { + if (labelSet.nonEmpty) { log.info(s"DataCutter is keeping labels: $labelSet and dropping labels: $labelsDropped") } else { throw new RuntimeException(s"DataCutter dropped all labels with param settings:" + diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala index 6d64b7e858..5666451f7e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/DataSplitter.scala @@ -32,9 +32,8 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.UID -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import org.apache.spark.ml.param._ -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.types.MetadataBuilder case object DataSplitter { @@ -70,7 +69,7 @@ class DataSplitter(uid: String = UID[DataSplitter]) extends Splitter(uid = uid) * @param data * @return training set and test set */ - def prepare(data: Dataset[LabelFeaturesKey]): ModelData = + def prepare(data: Dataset[Row]): ModelData = new ModelData(data, new MetadataBuilder()) override def copy(extra: ParamMap): DataSplitter = { diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala index 72fae3647f..f062ff0b1d 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpCrossValidation.scala @@ -22,22 +22,17 @@ package com.salesforce.op.stages.impl.tuning import com.github.fommil.netlib.BLAS import com.salesforce.op.evaluators.OpEvaluatorBase -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.types.StructType -import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames, StageParamNames} -import 
com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey -import org.apache.spark.mllib.util.MLUtils -import org.apache.spark.sql.{Dataset, Row} +import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames} +import com.salesforce.op.utils.stages.FitStagesUtil._ import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.{Dataset, Row, SparkSession} -import scala.collection.parallel.mutable.ParArray - -private[impl] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] +private[op] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] ( val numFolds: Int = ValidatorParamDefaults.NumFolds, val seed: Long = ValidatorParamDefaults.Seed, @@ -49,11 +44,11 @@ private[impl] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] private val blas = BLAS.getInstance() private def findBestModel( - folds: ParArray[(E, Array[Double], Array[ParamMap])] + folds: Seq[ValidatedModel[E]] ): ValidatedModel[E] = { - val metrics = folds.map(_._2).reduce(_ + _) + val metrics = folds.map(_.metrics).reduce(_ + _) blas.dscal(metrics.length, 1.0 / numFolds, metrics, 1) - val (est, _, grid) = folds.head + val ValidatedModel(est, _, _, grid) = folds.head log.info(s"Average cross-validation for $est metrics: {}", metrics.toSeq.mkString(",")) val (bestMetric, bestIndex) = if (evaluator.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) @@ -64,111 +59,106 @@ private[impl] class OpCrossValidation[M <: Model[_], E <: Estimator[_]] } // TODO use futures to parallelize https://github.com/apache/spark/commit/16c4c03c71394ab30c8edaf4418973e1a2c5ebfe - private[op] def validate( + private[op] override def validate[T]( modelInfo: Seq[ModelInfo[E]], - dataset: Dataset[_], + dataset: Dataset[T], label: String, - features: String - ): BestModel[M] = { - - // get param that stores the label column - val labelCol = evaluator.getParam(ValidatorParamDefaults.labelCol) - evaluator.set(labelCol, label) - - val sparkSession = dataset.sparkSession - import sparkSession.implicits._ - val rdd = dataset.as[LabelFeaturesKey].rdd.persist() + features: String, + dag: Option[StagesDAG] = None, + splitter: Option[Splitter] = None, + stratifyCondition: Boolean = isClassification && stratify + )(implicit spark: SparkSession): BestEstimator[E] = { + dataset.persist() + val schema = dataset.schema // creating k train/validation data - val splits: Array[(RDD[Row], RDD[Row])] = createTrainValidationSplits(rdd) - - - val schema = dataset.schema - val newSchema = StructType(schema.dropRight(1)) // dropping key - - val modelWithGrid = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) - - val fitSummary = splits.zipWithIndex.par.flatMap { - case ((training, validation), splitIndex) => - - log.info(s"Cross Validation $splitIndex with multiple sets of parameters.") - val trainingDataset = sparkSession.createDataFrame(training, newSchema).persist() - val validationDataset = sparkSession.createDataFrame(validation, newSchema).persist() - - val summary = modelWithGrid.map { - case (estimator, paramGrids, name) => - val pi1 = estimator.getParam(StageParamNames.inputParam1Name) - val pi2 = estimator.getParam(StageParamNames.inputParam2Name) - estimator.set(pi1, label).set(pi2, features) - - val numModels = paramGrids.length - val metrics = new Array[Double](paramGrids.length) - - // multi-model 
training - val models = estimator.fit(trainingDataset, paramGrids).asInstanceOf[Seq[M]] - var i = 0 - while (i < numModels) { - val metric = evaluator.evaluate(models(i).transform(validationDataset, paramGrids(i))) - log.debug(s"Got metric $metric for $name trained with ${paramGrids(i)}.") - metrics(i) = metric - i += 1 - } - (estimator, metrics, paramGrids) + val splits: Array[(RDD[Row], RDD[Row])] = createTrainValidationSplits( + stratifyCondition = stratifyCondition, + dataset = dataset, + label = label, + splitter = splitter + ) + + val modelsWithGrids = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) + + // TODO use futures to parallelize https://github.com/apache/spark/commit/16c4c03c71394ab30c8edaf4418973e1a2c5ebfe + val groupedSummary = suppressLoggingForFun() { + splits.zipWithIndex.flatMap { + case ((training, validation), splitIndex) => { + log.info(s"Cross Validation $splitIndex with multiple sets of parameters.") + val trainingDataset = spark.createDataFrame(training, schema) + val validationDataset = spark.createDataFrame(validation, schema) + val (newTrain, newTest) = dag.map(theDAG => + // If there is a CV DAG, then run it + applyDAG( + dag = theDAG, + training = trainingDataset, + validation = validationDataset, + label = label, + features = features, + splitter = splitter + ) + ).getOrElse(trainingDataset, validationDataset) + getSummary(modelsWithGrids = modelsWithGrids, label = label, features = features, + train = newTrain, test = newTest) } - trainingDataset.unpersist() - validationDataset.unpersist() - summary + }.groupBy(_.model).map{ case (_, folds) => findBestModel(folds) }.toArray } - rdd.unpersist() - - val groupedSummary = fitSummary.groupBy(_._1).map { case (_, folds) => findBestModel(folds) }.toArray + dataset.unpersist() - val model = - if (evaluator.isLargerBetter) groupedSummary.maxBy(_.bestMetric) - else groupedSummary.minBy(_.bestMetric) - - val bestModel = model.model.fit(dataset, model.bestGrid).asInstanceOf[M] - wrapBestModel(groupedSummary, bestModel, s"$numFolds folds") + val model = getValidatedModel(groupedSummary) + wrapBestEstimator(groupedSummary, model.model.copy(model.bestGrid).asInstanceOf[E], s"$numFolds folds") } // TODO : Implement our own kFold method for better performance in a separate PR /** * Creates Train Validation Splits For CV - * @param rdd + * + * @param stratifyCondition condition to do stratify cv + * @param dataset dataset to split + * @param label name of label in data + * @param splitter used to estimate splitter params prior to cv * @return Array((TrainRDD, ValidationRDD), Index) */ - private[op] override def createTrainValidationSplits( - rdd: RDD[(Double, Vector, String)]): Array[(RDD[Row], RDD[Row])] = { - - if (stratify && isClassification) { - log.info(s"Creating $numFolds stratified folds") - val classes = rdd.map(_._1).distinct().collect() - // Creates RDD grouped by classes (0, 1, 2, 3, ..., K) - val rddByClass = classes.map(label => rdd.filter(_._1 == label) - .map { case (label, features, key) => key -> Seq(Row(label, features)) }.reduceByKey(_ ++ _)) - - // Cross Validation's Train/Validation data for each class - val foldsByClass = rddByClass.map { case rdd: RDD[(String, Seq[Row])] => { - MLUtils.kFold(rdd, numFolds, seed) - .map { case (rdd1, rdd2) => (rdd1.values.flatMap(identity), rdd2.values.flatMap(identity)) } - } - }.toSeq - - if (foldsByClass.isEmpty) throw new Error("Train Validation Data Grouped by class is empty") - // Merging Train/Validation data one by one - 
foldsByClass.reduce[Array[(RDD[Row], RDD[Row])]] { - // cv1 and cv2 are arrays of train/validation data - case (cv1: Array[(RDD[Row], RDD[Row])], cv2: Array[(RDD[Row], RDD[Row])]) => - (cv1 zip cv2).map { // zip the two arrays and merge the tuples one by one - case ((train1: RDD[Row], test1: RDD[Row]), (train2: RDD[Row], test2: RDD[Row])) => - (train1.union(train2), test1.union(test2)) - } - } } else { - val rddRow = rdd.map { case (label, features, key) => key -> Seq(Row(label, features)) }.reduceByKey(_ ++ _) + val rddRow = dataset.toDF().rdd MLUtils.kFold(rddRow, numFolds, seed) - .map { case (rdd1, rdd2) => (rdd1.values.flatMap(identity), rdd2.values.flatMap(identity)) } + } + } + + + private def stratifyKFolds(rddsByClass: Array[RDD[Row]]): Array[(RDD[Row], RDD[Row])] = { + // Cross Validation's Train/Validation data for each class + val foldsByClass = rddsByClass.map(rdd => MLUtils.kFold(rdd, numFolds, seed)).toSeq + + if (foldsByClass.isEmpty) { + throw new RuntimeException("Dataset is too small for the number of CV folds selected: some empty datasets were created") + } + // Merging Train/Validation data one by one + foldsByClass.reduce[Array[(RDD[Row], RDD[Row])]] { + // cv1 and cv2 are arrays of train/validation data + case (cv1: Array[(RDD[Row], RDD[Row])], cv2: Array[(RDD[Row], RDD[Row])]) => + (cv1 zip cv2).map { // zip the two arrays and merge the tuples one by one + case ((train1: RDD[Row], test1: RDD[Row]), (train2: RDD[Row], test2: RDD[Row])) => + (train1.union(train2), test1.union(test2)) + } } } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala index 6481276fdc..7d79fefddc 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala @@ -20,17 +20,16 @@ package com.salesforce.op.stages.impl.tuning -import com.salesforce.op.evaluators.{OpBinaryClassificationEvaluatorBase, OpEvaluatorBase, OpMultiClassificationEvaluatorBase} +import com.salesforce.op.evaluators.OpEvaluatorBase import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames, StageParamNames} -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey -import org.apache.spark.ml.linalg.Vector +import com.salesforce.op.utils.stages.FitStagesUtil._ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Dataset, Row, SparkSession} -private[impl] class OpTrainValidationSplit[M <: Model[_], E <: Estimator[_]] ( val trainRatio: Double = ValidatorParamDefaults.TrainRatio, val seed: Long = ValidatorParamDefaults.Seed, @@ -40,104 +39,100 @@ 
private[impl] class OpTrainValidationSplit[M <: Model[_], E <: Estimator[_]] val validationName: String = ModelSelectorBaseNames.TrainValSplitResults - private[op] def validate( + private[op] override def validate[T]( modelInfo: Seq[ModelInfo[E]], - dataset: Dataset[_], + dataset: Dataset[T], label: String, - features: String - ): BestModel[M] = { - // get param that stores the label column - val labelCol = evaluator.getParam(ValidatorParamDefaults.labelCol) - evaluator.set(labelCol, label) + features: String, + dag: Option[StagesDAG] = None, + splitter: Option[Splitter] = None, + stratifyCondition: Boolean = isClassification && stratify + )(implicit spark: SparkSession): BestEstimator[E] = { + dataset.persist() val schema = dataset.schema - import dataset.sparkSession.implicits._ - val rdd = dataset.as[LabelFeaturesKey].rdd.persist() - - val (trainingRDD, validationRDD) = createTrainValidationSplits(rdd).head - val sparkSession = dataset.sparkSession - val newSchema = StructType(schema.dropRight(1)) // dropping key - val trainingDataset = sparkSession.createDataFrame(trainingRDD, newSchema).persist() - val validationDataset = sparkSession.createDataFrame(validationRDD, newSchema).persist() - - // multi-model training - val modelWithGrid = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) - val groupedSummary = modelWithGrid.par.map { - case (estimator, paramGrids, name) => - val pi1 = estimator.getParam(StageParamNames.inputParam1Name) - val pi2 = estimator.getParam(StageParamNames.inputParam2Name) - estimator.set(pi1, label).set(pi2, features) - - val numModels = paramGrids.length - val metrics = new Array[Double](paramGrids.length) - - log.info(s"Train split with multiple sets of parameters.") - val models = estimator.fit(trainingDataset, paramGrids).asInstanceOf[Seq[M]] - var i = 0 - while (i < numModels) { - val metric = evaluator.evaluate(models(i).transform(validationDataset, paramGrids(i))) - log.info(s"Got metric $metric for model $name trained with ${paramGrids(i)}.") - metrics(i) = metric - i += 1 - } - log.info(s"Train validation split for $name metrics: {}", metrics.toSeq.mkString(",")) - val (bestMetric, bestIndex) = - if (evaluator.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) - else metrics.zipWithIndex.minBy(_._1) - log.info(s"Best set of parameters:\n${paramGrids(bestIndex)} for $name") - log.info(s"Best train validation split metric: $bestMetric.") - - ValidatedModel(estimator, bestIndex, metrics, paramGrids) + val (training, validation) = createTrainValidationSplits( + stratifyCondition = stratifyCondition, + dataset = dataset, + label = label, + splitter = splitter + ).head + + val trainingDataset = dataset.sparkSession.createDataFrame(training, schema) + val validationDataset = dataset.sparkSession.createDataFrame(validation, schema) + + // If there is a TS DAG, then run it + val (newTrain, newTest) = suppressLoggingForFun() { + dag.map(theDAG => applyDAG( + dag = theDAG, + training = trainingDataset, + validation = validationDataset, + label = label, + features = features, + splitter = splitter + )).getOrElse(trainingDataset, validationDataset) } - trainingDataset.unpersist() - validationDataset.unpersist() - rdd.unpersist() + // multi-model training + val modelsWithGrids = modelInfo.map(m => (m.sparkEstimator, m.grid.build(), m.modelName)) + + val groupedSummary = getSummary( + modelsWithGrids = modelsWithGrids, label = label, features = features, + train = newTrain, test = newTest + ) - val model = - if (evaluator.isLargerBetter) 
groupedSummary.maxBy(_.bestMetric) - else groupedSummary.minBy(_.bestMetric) + dataset.unpersist() - val bestModel = model.model.fit(dataset, model.bestGrid).asInstanceOf[M] - wrapBestModel(groupedSummary.toArray, bestModel, s"$trainRatio training split") + val model = getValidatedModel(groupedSummary) + wrapBestEstimator(groupedSummary, model.model.copy(model.bestGrid).asInstanceOf[E], s"$trainRatio training split") } - // TODO : Implement our own startified split method for better performance in a separate PR /** * Creates Train Validation Splits For TS - * @param rdd - * @return + * + * @param stratifyCondition condition to do stratify ts + * @param dataset dataset to split + * @param label name of label in dataset + * @param splitter used to estimate splitter params prior to ts + * @return Array[(Train, Test)] */ - private[op] override def createTrainValidationSplits( - rdd: RDD[(Double, Vector, String)]): Array[(RDD[Row], RDD[Row])] = { - - val Array(trainData, validateData) = { - if (stratify && isClassification) { - log.info(s"Creating stratified train/validation with training ratio of $trainRatio") - - val classes = rdd.map(_._1).distinct().collect() - // Creates RDD grouped by classes (0, 1, 2, 3, ..., K) - val rddByClass = classes.map(label => rdd.filter(_._1 == label) - .map { case (label, features, key) => key -> Seq(Row(label, features)) }.reduceByKey(_ ++ _)) - - // Train/Validation data for each class - val splitByClass = rddByClass.map(_.randomSplit(Array(trainRatio, 1 - trainRatio), seed) - .map(_.values.flatMap(identity))) - - if (splitByClass.isEmpty) throw new Error("Train Validation Data Grouped by class is empty") - // Merging Train/Validation data one by one - splitByClass.reduce[Array[RDD[Row]]] { - case (Array(train1: RDD[Row], validate1: RDD[Row]), Array(train2: RDD[Row], validate2: RDD[Row])) => - Array(train1.union(train2), validate1.union(validate2)) - } - - } else { - rdd.map { case (label, features, key) => key -> Seq(Row(label, features)) } - .reduceByKey(_ ++ _) - .randomSplit(Array(trainRatio, 1 - trainRatio), seed) - .map(_.values.flatMap(identity)) - } + private[op] override def createTrainValidationSplits[T]( + stratifyCondition: Boolean, + dataset: Dataset[T], + label: String, + splitter: Option[Splitter] = None + ): Array[(RDD[Row], RDD[Row])] = { + + // get param that stores the label column + val labelCol = evaluator.getParam(ValidatorParamDefaults.LabelCol) + evaluator.set(labelCol, label) + + val Array(train, test) = if (stratifyCondition) { + val rddsByClass = prepareStratification( + dataset = dataset, + message = s"Creating stratified train/validation with training ratio of $trainRatio", + label = label, + splitter = splitter + ) + stratifyTrainValidationSplit(rddsByClass) + } else { + val rddRow = dataset.toDF().rdd + rddRow.randomSplit(Array(trainRatio, 1 - trainRatio), seed) } - Array((trainData, validateData)) + Array((train, test)) } + + private def stratifyTrainValidationSplit(rddsByClass: Array[RDD[Row]]): Array[RDD[Row]] = { + // Train/Validation data for each class + val splitByClass = rddsByClass.map(_.randomSplit(Array(trainRatio, 1 - trainRatio), seed)) + + if (splitByClass.isEmpty) throw new Error("Train Validation Data Grouped by class is empty") + // Merging Train/Validation data one by one + splitByClass.reduce[Array[RDD[Row]]] { + case (Array(train1: RDD[Row], validate1: RDD[Row]), Array(train2: RDD[Row], validate2: RDD[Row])) => + Array(train1.union(train2), validate1.union(validate2)) + } + } + } + diff --git 
a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala index 0bde386e32..5a5f284f8e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpValidator.scala @@ -31,16 +31,20 @@ package com.salesforce.op.stages.impl.tuning +import com.salesforce.op.utils.stages.FitStagesUtil._ +import com.salesforce.op.utils.stages.FitStagesUtil import com.salesforce.op.evaluators.{OpBinaryClassificationEvaluatorBase, OpEvaluatorBase, OpMultiClassificationEvaluatorBase} -import com.salesforce.op.stages.impl.selector.ModelInfo -import org.apache.spark.ml.linalg.Vector +import com.salesforce.op.stages.impl.selector.{ModelInfo, ModelSelectorBaseNames, StageParamNames} +import org.apache.log4j.{Level, LogManager} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, Row} -import org.apache.spark.sql.types.MetadataBuilder +import org.apache.spark.sql.functions.monotonically_increasing_id +import org.apache.spark.sql.types.{MetadataBuilder, StructType} +import org.apache.spark.sql.{Dataset, Row, SparkSession, functions} import org.slf4j.{Logger, LoggerFactory} + /** * Best Model container * @@ -54,9 +58,9 @@ case class BestModel[M <: Model[_]](name: String, model: M, metadata: Option[Met /** * Best Estimator container * - * @param name the name of the best model - * @param estimator best estimator - * @param metadata optional metadata + * @param name the name of the best model + * @param estimator best estimator + * @param metadata optional metadata + * @tparam E model type */ case class BestEstimator[E <: Estimator[_]](name: String, estimator: E, metadata: MetadataBuilder = new MetadataBuilder) @@ -80,6 +84,7 @@ private[tuning] case class ValidatedModel[E <: Estimator[_]] * Best metric (metric at bestIndex) */ def bestMetric: Double = metrics(bestIndex) + /** * Best grid (param grid at bestIndex) */ @@ -94,9 +99,14 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial @transient protected lazy val log: Logger = LoggerFactory.getLogger(this.getClass) + type ModelWithGrids = Seq[(E, Array[ParamMap], String)] + def seed: Long + def evaluator: OpEvaluatorBase[_] + def validationName: String + def stratify: Boolean private[op] final def isClassification = evaluator match { @@ -105,35 +115,44 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial case _ => false } + /** * Function that performs the model selection * - * @param modelInfo estimators and grids to validate + * @param modelInfo estimators and grids to validate + * @param dataset data to validate on * @param label * @param features - * @param dataset - * @return estimator + * @param dag optional DAG to fit within the validation splits (used by workflow level CV) + * @param splitter used to estimate splitter params prior to splits + * @param stratifyCondition Condition to stratify CV/TS + * @param spark Spark session + * @return the best estimator found, with its metadata */ - private[op] def validate( + modelInfo: Seq[ModelInfo[E]], - dataset: Dataset[_], + dataset: Dataset[T], label: String, - features: String + private[op] def validate[T]( + modelInfo: Seq[ModelInfo[E]], + dataset: Dataset[T], + label: String, + features: String, + dag: Option[StagesDAG] = None, + splitter: Option[Splitter] = None, + stratifyCondition: Boolean = isClassification && stratify + )(implicit spark: SparkSession): BestEstimator[E] + 
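The abstract validate contract above now returns the winning, unfitted estimator rather than a fitted model, leaving the final refit to the caller. A sketch of a call site, mirroring findBestEstimator earlier in this diff (argument values illustrative):

    val best: BestEstimator[E] = validator.validate(
      modelInfo = getUsedModels,                      // only models with useModel set
      dataset = trainData,
      label = in1.name,
      features = in2.name,
      dag = None,                                     // Some(dag) under workflow-level CV
      splitter = splitter,                            // e.g. a DataBalancer or DataCutter
      stratifyCondition = validator.isClassification  // stratify folds for classifiers
    )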
/** * Get the best model and the metadata with the validator params * - * @param modelsFit info from validation - * @param bestModel best fit model - * @param splitInfo split info for logging + * @param modelsFit info from validation + * @param bestEstimator best estimator found + * @param splitInfo split info for logging + * @return best estimator */ - private[op] def wrapBestModel( modelsFit: Array[ValidatedModel[E]], - bestModel: M, + bestEstimator: E, splitInfo: String - ): BestModel[M] = { + ): BestEstimator[E] = { log.info( "Model Selection over {} with {} with {} and the {} metric", modelsFit.map(_.model.getClass.getSimpleName).mkString(","), validationName, splitInfo, evaluator.name @@ -143,16 +162,18 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial val newMeta = new MetadataBuilder().putMetadata(validationName, meta.build()) val (bestModelName, _) = if (evaluator.isLargerBetter) cvFittedModels.maxBy(_._2) else cvFittedModels.minBy(_._2) - BestModel(name = bestModelName, model = bestModel, metadata = Option(newMeta)) + BestEstimator(name = bestModelName, estimator = bestEstimator, metadata = newMeta) } /** * Update metadata during model selection and return best model name + * * @return best model name */ private[op] def updateBestModelMetadata(metadataBuilder: MetadataBuilder, v: ValidatedModel[E]): String = { val ValidatedModel(model, bestIndex, metrics, grids) = v val modelParams = model.extractParamMap() + def makeModelName(index: Int) = s"${model.uid}_$index" for {((paramGrid, met), ind) <- grids.zip(metrics).zipWithIndex} { @@ -167,17 +188,140 @@ private[impl] trait OpValidator[M <: Model[_], E <: Estimator[_]] extends Serial makeModelName(bestIndex) } + /** * Creates Train Validation Splits - * @param rdd - * @return Train Validation Splits + * + * @param stratifyCondition condition to stratify splits + * @param dataset dataset to split + * @param label name of label in dataset + * @param splitter used to estimate splitter params prior to splits + * @return Array[(TrainRDD, ValidationRDD)] + */ + private[op] def createTrainValidationSplits[T](stratifyCondition: Boolean, + dataset: Dataset[T], label: String, splitter: Option[Splitter] = None): Array[(RDD[Row], RDD[Row])] + + + protected def prepareStratification[T]( + dataset: Dataset[T], + message: String, + label: String, + splitter: Option[Splitter] = None + ): Array[RDD[Row]] = { + log.info(message) + import dataset.sqlContext.implicits._ + val classes = dataset.select(label).as[Double].distinct().collect().sorted + val datasetsByClass = classes.map(theClass => dataset.filter(functions.col(label) === theClass)) + + splitter.map { + case d: DataBalancer => { + val Array(negative, positive) = datasetsByClass + d.estimate( + data = dataset, + positiveData = positive, + negativeData = negative, + seed = d.getSeed + ) + } + case c: DataCutter => { + val labelCounts = dataset.sparkSession.createDataFrame(classes zip datasetsByClass.map(_.count())).persist + c.estimate(labelCounts) + labelCounts.unpersist + } + case _ => + } + // Creates RDD grouped by classes (0, 1, 2, 3, ..., K) + datasetsByClass.map(_.toDF().rdd) + } + + protected def applyDAG( + dag: StagesDAG, + training: Dataset[Row], + validation: Dataset[Row], + label: String, + features: String, + splitter: Option[Splitter] + )(implicit sparkSession: SparkSession): (Dataset[Row], Dataset[Row]) = { + import sparkSession.implicits._ + + val FittedDAG(newTrain, newTest, _) = FitStagesUtil.fitAndTransformDAG( + dag = dag, + train = training, + test = validation, + hasTest = true, + indexOfLastEstimator = Some(-1) + ) + val selectTrain = newTrain.select(label, features) + .withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()) + + val selectTest 
= newTest.select(label, features) + .withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()) + + val (balancedTrain, balancedTest) = splitter.map(s => ( + s.prepare(selectTrain).train, + s.prepare(selectTest).train) + ).getOrElse((selectTrain, selectTest)) + + (balancedTrain, balancedTest) + } + + /** + * Suppress logging to a specified level when executing method `f`. */ - private[op] def createTrainValidationSplits(rdd: RDD[(Double, Vector, String)]): Array[(RDD[Row], RDD[Row])] + protected def suppressLoggingForFun[Result](level: Level = Level.ERROR)(f: => Result): Result = { + val opLog = LogManager.getLogger("com.salesforce.op") + val originalLevel = opLog.getLevel + opLog.setLevel(level) + val result = f + opLog.setLevel(originalLevel) // Reset log level back to normal + result + } + + protected def getValidatedModel(groupedSummary: Array[ValidatedModel[E]]): ValidatedModel[E] = { + if (evaluator.isLargerBetter) groupedSummary.maxBy(_.bestMetric) else groupedSummary.minBy(_.bestMetric) + } + + protected def getSummary[T]( + modelsWithGrids: ModelWithGrids, label: String, features: String, train: Dataset[T], test: Dataset[T] + ): Array[ValidatedModel[E]] = { + train.persist() + test.persist() + val summary = modelsWithGrids.par.map { + case (estimator, paramGrids, name) => + val pi1 = estimator.getParam(StageParamNames.inputParam1Name) + val pi2 = estimator.getParam(StageParamNames.inputParam2Name) + estimator.set(pi1, label).set(pi2, features) + + val numModels = paramGrids.length + val metrics = new Array[Double](paramGrids.length) + + log.info(s"Train split with multiple sets of parameters.") + val models = estimator.fit(train, paramGrids).asInstanceOf[Seq[M]] + var i = 0 + while (i < numModels) { + val metric = evaluator.evaluate(models(i).transform(test, paramGrids(i))) + log.info(s"Got metric $metric for model $name trained with ${paramGrids(i)}.") + metrics(i) = metric + i += 1 + } + val (bestMetric, bestIndex) = + if (evaluator.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) + else metrics.zipWithIndex.minBy(_._1) + log.info(s"Best set of parameters:\n${paramGrids(bestIndex)} for $name") + log.info(s"Best train validation split metric: $bestMetric.") + + ValidatedModel(estimator, bestIndex, metrics, paramGrids) + }.toArray + train.unpersist() + test.unpersist() + summary + } + } object ValidatorParamDefaults { def Seed: Long = util.Random.nextLong // scalastyle:off method.name - val labelCol = "labelCol" + val LabelCol = "labelCol" val NumFolds = 3 val TrainRatio = 0.75 val Stratify = false diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala index ef35eabe9c..1bdface8b9 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/tuning/Splitter.scala @@ -31,17 +31,10 @@ package com.salesforce.op.stages.impl.tuning -import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param._ -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.types.{Metadata, MetadataBuilder} -/** - * Case class of data used in model selectors for data prep and cross validation - */ -case object SelectorData { - type LabelFeaturesKey = (Double, Vector, String) -} /** * Case class for Training & test sets @@ -49,8 +42,8 @@ case object SelectorData { * @param train training set is persisted at construction * @param metadata metadata 
built at construction */ -case class ModelData private(train: Dataset[_], metadata: Metadata) { - def this(train: Dataset[_], metadata: MetadataBuilder) = +case class ModelData private(train: Dataset[Row], metadata: Metadata) { + def this(train: Dataset[Row], metadata: MetadataBuilder) = this(train.persist(), metadata.build()) } @@ -65,7 +58,7 @@ abstract class Splitter(val uid: String) extends SplitterParams { * @param data * @return (dataTrain, dataTest) */ - def split(data: Dataset[_]): (Dataset[_], Dataset[_]) = { + def split[T](data: Dataset[T]): (Dataset[T], Dataset[T]) = { val fraction = 1.0 - getReserveTestFraction val Array(dataTrain, dataTest) = data.randomSplit(Array(fraction, 1.0 - fraction), seed = $(seed)) dataTrain -> dataTest @@ -78,7 +71,7 @@ abstract class Splitter(val uid: String) extends SplitterParams { * @param data * @return Training set test set */ - def prepare(data: Dataset[SelectorData.LabelFeaturesKey]): ModelData + def prepare(data: Dataset[Row]): ModelData } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SwThreeStageBinaryEstimator.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SwThreeStageBinaryEstimator.scala deleted file mode 100644 index bf76a6d848..0000000000 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SwThreeStageBinaryEstimator.scala +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package com.salesforce.op.stages.sparkwrappers.generic - -import com.salesforce.op.UID -import com.salesforce.op.features.FeatureLike -import com.salesforce.op.features.types.FeatureType -import com.salesforce.op.stages.{OpPipelineStage2to3, _} -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.sql._ - -import scala.reflect.runtime.universe.TypeTag - -/** - * Generic wrapper for any spark estimator which has two inputs and three outputs - * - * @param inputParam1Name name of spark parameter that sets the first input column - * @param inputParam2Name name of spark parameter that sets the second input column - * @param outputParam1Name name of spark parameter that sets the first output column - * @param outputParam2Name name of spark parameter that sets the second output column - * @param outputParam3Name name of spark parameter that sets the third output column - * @param stage1OperationName unique name of the operation first stage performs - * @param stage2OperationName unique name of the operation second stage performs - * @param stage3OperationName unique name of the operation third stage performs - * @param sparkMlStageIn instance of spark estimator to wrap - * @param uid stage uid - * @param i1ttag type tag for first input - * @param i2ttag type tag for second input - * @param o1ttag type tag for first output - * @param o2ttag type tag for second output - * @param o3ttag type tag for third output - * @param i1ttiv type tag for first input value - * @param i2ttiv type tag for second input value - * @param o1ttov type tag for first output value - * @param o2ttov type tag for second output value - * @param o3ttov type tag for third output value - * @tparam I1 input feature type 1 - * @tparam I2 input feature type 2 - * @tparam O1 first output feature type - * @tparam O2 second output feature type - * @tparam O3 third output feature type - * @tparam M spark model type returned by spark estimator wrapped - * @tparam E spark estimator to wrap - */ -class SwThreeStageBinaryEstimator[I1 <: FeatureType, I2 <: FeatureType, O1 <: FeatureType, O2 <: FeatureType, -O3 <: FeatureType, M <: Model[M], E <: Estimator[M]] -( - val inputParam1Name: String, - val inputParam2Name: String, - val outputParam1Name: String, - val outputParam2Name: String, - val outputParam3Name: String, - val stage1OperationName: String, - val stage2OperationName: String, - val stage3OperationName: String, - private val sparkMlStageIn: Option[E], - val uid: String = UID[SwThreeStageBinaryEstimator[I1, I2, O1, O2, O3, M, E]] -)( - implicit val i1ttag: TypeTag[I1], - val i2ttag: TypeTag[I2], - val o1ttag: TypeTag[O1], - val o2ttag: TypeTag[O2], - val o3ttag: TypeTag[O3], - val i1ttiv: TypeTag[I1#Value], - val i2ttiv: TypeTag[I2#Value], - val o1ttov: TypeTag[O1#Value], - val o2ttov: TypeTag[O2#Value], - val o3ttov: TypeTag[O3#Value] -) extends Estimator[SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M]] - with OpPipelineStage2to3[I1, I2, O1, O2, O3] with SparkWrapperParams[E] { - - setSparkMlStage(sparkMlStageIn) - set(sparkInputColParamNames, Array(inputParam1Name, inputParam2Name)) - set(sparkOutputColParamNames, Array(outputParam1Name, outputParam2Name, outputParam3Name)) - - private lazy val stage1uid = UID[SwBinaryEstimator[I1, I2, O1, M, E]] - private lazy val stage2uid = UID[SwTernaryTransformer[I1, I2, O1, O2, M]] - private lazy val stage3uid = UID[SwQuaternaryTransformer[I1, I2, O1, O2, O3, M]] - - private lazy val outputName1 = makeOutputNameFromStageId[O1](stage1uid, Seq(in1, in2)) - private 
lazy val outputName2 = makeOutputNameFromStageId[O2](stage2uid, Seq(in1, in2), 2) - private lazy val outputName3 = makeOutputNameFromStageId[O3](stage3uid, Seq(in1, in2), 3) - - // put together parameter names and values - private lazy val outputs = $(sparkOutputColParamNames).zip( - Array(outputName1, outputName2, outputName3)) - - private[op] lazy val stage1 = new SwBinaryEstimatorSpecial[I1, I2, O1, M, E]( - inputParam1Name = $(sparkInputColParamNames)(0), - inputParam2Name = $(sparkInputColParamNames)(1), - outputParamName = $(sparkOutputColParamNames)(0), - operationName = stage1OperationName, - sparkMlStageIn = getSparkMlStage().map { spk => // set all the outputs for this stage - outputs.foldLeft(spk) { case (s, (pname, pvalue)) => s.set(s.getParam(pname), pvalue) } - }, - uid = stage1uid, - outputs - ).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2]) - - private[op] lazy val stage2 = new SwTernaryTransformer[I1, I2, O1, O2, M]( - inputParam1Name = $(sparkInputColParamNames)(0), - inputParam2Name = $(sparkInputColParamNames)(1), - inputParam3Name = stage1OperationName, - outputParamName = $(sparkOutputColParamNames)(1), - operationName = stage2OperationName, - sparkMlStageIn = None, - uid = stage2uid - ).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2], stage1.getOutput()) - - private[op] lazy val stage3 = new SwQuaternaryTransformer[I1, I2, O1, O2, O3, M]( - inputParam1Name = $(sparkInputColParamNames)(0), - inputParam2Name = $(sparkInputColParamNames)(1), - inputParam3Name = stage1OperationName, - inputParam4Name = stage2OperationName, - outputParamName = $(sparkOutputColParamNames)(2), - operationName = stage3OperationName, - sparkMlStageIn = None, - uid = stage3uid - ).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2], stage1.getOutput(), stage2.getOutput()) - - /** - * Output features that will be created by the transformation - * - * @return features of type O1, O2 and O3 - */ - final override def getOutput(): (FeatureLike[O1], FeatureLike[O2], FeatureLike[O3]) = { - (stage1.getOutput(), stage2.getOutput(), stage3.getOutput()) - } - - override def fit(dataset: Dataset[_]): SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M] = { - val model = stage1.fit(dataset) - - new SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M]( - inputParam1Name, - inputParam2Name, - outputParam1Name, - outputParam2Name, - outputParam3Name, - stage1OperationName, - stage2OperationName, - stage3OperationName, - model, - stage2, - stage3, - uid - ).setParent(this).setInput(in1.asFeatureLike[I1], in2.asFeatureLike[I2]) - - } -} - -/** - * Generic wrapper for any model returned by an estimator which has two inputs and three outputs - * - * @param inputParam1Name name of spark parameter that sets the first input column - * @param inputParam2Name name of spark parameter that sets the second input column - * @param outputParam1Name name of spark parameter that sets the first output column - * @param outputParam2Name name of spark parameter that sets the second output column - * @param outputParam3Name name of spark parameter that sets the third output column - * @param stage1OperationName unique name of the operation first stage performs - * @param stage2OperationName unique name of the operation second stage performs - * @param stage3OperationName unique name of the operation third stage performs - * @param stage1 first wrapping stage for output one (this is the only stage that actually does anything) - * @param stage2 second stage - dummy for generating second output - * @param stage3 third stage - 
dummy for generating third output - * @param uid stage uid - * @tparam I1 input feature type 1 - * @tparam I2 input feature type 2 - * @tparam O1 first output feature type - * @tparam O2 second output feature type - * @tparam O3 third output feature type - * @tparam M - */ -private[stages] final class SwThreeStageBinaryModel[I1 <: FeatureType, I2 <: FeatureType, O1 <: FeatureType, -O2 <: FeatureType, O3 <: FeatureType, M <: Model[M]] -( - val inputParam1Name: String, - val inputParam2Name: String, - val outputParam1Name: String, - val outputParam2Name: String, - val outputParam3Name: String, - val stage1OperationName: String, - val stage2OperationName: String, - val stage3OperationName: String, - val stage1: SwBinaryModel[I1, I2, O1, M], - val stage2: SwTernaryTransformer[I1, I2, O1, O2, M], - val stage3: SwQuaternaryTransformer[I1, I2, O1, O2, O3, M], - val uid: String -) extends Model[SwThreeStageBinaryModel[I1, I2, O1, O2, O3, M]] - with OpPipelineStage2to3[I1, I2, O1, O2, O3] with SparkWrapperParams[M] { - - setSparkMlStage(stage1.getSparkMlStage()) - set(sparkInputColParamNames, Array(inputParam1Name, inputParam2Name)) - set(sparkOutputColParamNames, Array(outputParam1Name, outputParam2Name, outputParam3Name)) - - override def transform(dataset: Dataset[_]): DataFrame = stage1.transform(dataset) - - override def getOutput(): (FeatureLike[O1], FeatureLike[O2], FeatureLike[O3]) = - (stage1.getOutput(), stage2.getOutput(), stage3.getOutput()) -} - -/** - * Wrapper for any spark estimator that has two inputs and three outputs (for use in three stage wrapper) - */ -private[op] class SwBinaryEstimatorSpecial[I1 <: FeatureType, I2 <: FeatureType, O <: FeatureType, -M <: Model[M], E <: Estimator[M]] -( - inputParam1Name: String, - inputParam2Name: String, - outputParamName: String, - operationName: String, - private val sparkMlStageIn: Option[E], - uid: String = UID[SwBinaryEstimator[I1, I2, O, M, E]], - val outputNames: Array[(String, String)] -)( - implicit tti1: TypeTag[I1], - tti2: TypeTag[I2], - tto: TypeTag[O], - ttov: TypeTag[O#Value] -) extends SwBinaryEstimator[I1, I2, O, M, E] (inputParam1Name = inputParam1Name, inputParam2Name = inputParam2Name, - outputParamName = outputParamName, operationName = operationName, sparkMlStageIn = sparkMlStageIn, - uid = uid)(tti1 = tti1, tti2 = tti2, tto = tto, ttov = ttov){ - - override def setOutputFeatureName(m: String): this.type = { - getSparkMlStage().map { spk => // set all the outputs for this stage - outputNames.zipWithIndex.foldLeft(spk) { case (s, ((pname, pvalue), i)) => - val newName = updateOutputName(m, pvalue, i) - s.set(s.getParam(pname), newName) - }} - set(outputFeatureName, m) - } -} diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala index 67b4bc069f..59b69551a9 100644 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpEstimatorWrapper.scala @@ -41,7 +41,7 @@ import scala.reflect.runtime.universe.TypeTag /** * Wraps a spark ML estimator. This wrapper is meant for Estimators not already covered by more specific - * wrappers such as: [[OpProbabilisticClassifierWrapper]] and [[OpPredictorWrapper]]. + * wrappers such as: [[OpPredictorWrapper]]. 
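For reference, a minimal sketch of wrapping one such estimator with OpEstimatorWrapper; the constructor shape and the `featureVector` input feature are assumptions for illustration, not taken from this diff:

import com.salesforce.op.features.types.OPVector
import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
import org.apache.spark.ml.feature.{MinMaxScaler, MinMaxScalerModel}

// featureVector: FeatureLike[OPVector] -- assumed to exist upstream in the workflow
val scaled = new OpEstimatorWrapper[OPVector, OPVector, MinMaxScaler, MinMaxScalerModel](new MinMaxScaler())
  .setInput(featureVector)
  .getOutput()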
* Examples of estimators meant to be wrapped with OpEstimatorWrapper include MinMaxScaler, IDF, VectorIndexer, * CountVectorizer, QuantileDiscretizer, StandardScaler, PCA, MaxAbsScaler, Word2Vec, etc. * Their defining characteristic is that they output a Model which takes in one column as input and output diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpNaiveBayesModel.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictionModel.scala similarity index 60% rename from core/src/main/scala/org/apache/spark/ml/classification/OpNaiveBayesModel.scala rename to core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictionModel.scala index 7408c99194..d24f8b852d 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpNaiveBayesModel.scala +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictionModel.scala @@ -29,28 +29,38 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package org.apache.spark.ml.classification +package com.salesforce.op.stages.sparkwrappers.specific -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.linalg.{Matrix, Vector} +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import org.apache.spark.ml.PredictionModel +import org.apache.spark.ml.linalg.Vector -import scala.reflect.runtime.universe.TypeTag +import scala.reflect.runtime.universe._ -class OpNaiveBayesModel +/** + * Class that takes in a spark PredictionModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + * @tparam T type of the model to wrap + */ +abstract class OpPredictionModel[T <: PredictionModel[Vector, T]] ( - pi: Vector, - theta: Matrix, - val oldLabelsIn: Array[Double], - val modelTypeIn: String, - uid: String = UID[OpNaiveBayesModel], - val operationName: String = "opNB" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends NaiveBayesModel(uid = uid, pi = pi, theta = theta) with OpClassifierModelBase { - this.oldLabels = oldLabelsIn - set(modelType, modelTypeIn) + sparkModel: T, + uid: String, + operationName: String +) extends OpPredictorWrapperModel[T](uid = uid, operationName = operationName, sparkModel = sparkModel) { + + protected def predictMirror: MethodMirror + + protected def predict(features: Vector): Double = predictMirror.apply(features).asInstanceOf[Double] + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => + Prediction(prediction = predict(features.value)) + } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala index e2cef41c4a..06b4c4a8de 100644 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapper.scala @@ -29,56 +29,95 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -// scalastyle:off package com.salesforce.op.stages.sparkwrappers.specific import com.salesforce.op.UID -import com.salesforce.op.features.types.{FeatureType, OPVector} -import com.salesforce.op.stages.sparkwrappers.generic.SwBinaryEstimator +import com.salesforce.op.features.types.{FeatureType, OPVector, Prediction, RealNN} +import com.salesforce.op.stages.{OpPipelineStage2, SparkStageParam} +import com.salesforce.op.stages.base.binary.{BinaryEstimator, BinaryModel, OpTransformer2} +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams +import org.apache.spark.ml._ import org.apache.spark.ml.linalg.Vector -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.{PredictionModel, Predictor, SparkMLSharedParamConstants} +import org.apache.spark.sql.Dataset import scala.reflect.runtime.universe.TypeTag /** * Wraps a spark ML predictor. Predictors represent supervised learning algorithms (regression and classification) in - * spark ML that inherit from [[Predictor]], examples of which include: + * spark ML that inherit from [[Predictor]]. The supported predictors are: + * [[org.apache.spark.ml.classification.LogisticRegression]], + * [[org.apache.spark.ml.regression.LinearRegression]], + * [[org.apache.spark.ml.classification.RandomForestClassifier]], * [[org.apache.spark.ml.regression.RandomForestRegressor]], - * [[org.apache.spark.ml.regression.GBTRegressor]], [[org.apache.spark.ml.classification.GBTClassifier]], + * [[org.apache.spark.ml.classification.NaiveBayes]], + * [[org.apache.spark.ml.classification.GBTClassifier]], + * [[org.apache.spark.ml.regression.GBTRegressor]], + * [[org.apache.spark.ml.classification.DecisionTreeClassifier]], * [[org.apache.spark.ml.regression.DecisionTreeRegressor]], + * [[org.apache.spark.ml.classification.LinearSVC]], * [[org.apache.spark.ml.classification.MultilayerPerceptronClassifier]], - * [[org.apache.spark.ml.regression.LinearRegression]], - * and [[org.apache.spark.ml.regression.GeneralizedLinearRegression]]. + * [[org.apache.spark.ml.regression.GeneralizedLinearRegression]]. * Their defining characteristic is that they output a model which takes in 2 columns as input (labels and features) - * and output one column as result. - * NOTE: Probabilistic classifiers contain additional output information, and so there is a specific wrapper - * for that kind of classifier see: [[OpProbabilisticClassifierWrapper]] + * and outputs one to three columns as result.
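A minimal usage sketch of the reworked wrapper, built from the constructor and stage types introduced in this diff; the `label` and `features` workflow features are assumed to exist upstream:

import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.sparkwrappers.specific.OpPredictorWrapper
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}

// label: FeatureLike[RealNN], features: FeatureLike[OPVector] -- assumed upstream features
val prediction = new OpPredictorWrapper[LogisticRegression, LogisticRegressionModel](new LogisticRegression())
  .setInput(label, features)
  .getOutput() // a single Prediction feature rather than separate prediction/rawPrediction/probability outputs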
* * @param predictor the predictor to wrap * @param uid stage uid - * @tparam I the type of the transformation input feature - * @tparam O the type of the transformation output feature - * @tparam E spark estimator to wrap - * @tparam M spark model type returned by spark estimator wrapped + * @tparam E spark estimator to wrap + * @tparam M spark model returned */ -class OpPredictorWrapper[I <: FeatureType, O <: FeatureType, E <: Predictor[Vector, E, M], -M <: PredictionModel[Vector, M]] +class OpPredictorWrapper[E <: Predictor[Vector, E, M], M <: PredictionModel[Vector, M]] ( val predictor: E, - uid: String = UID[OpPredictorWrapper[I, O, E, M]] + val uid: String = UID[OpPredictorWrapper[_, _]] +)( + implicit val tti1: TypeTag[RealNN], + val tti2: TypeTag[OPVector], + val tto: TypeTag[Prediction], + val ttov: TypeTag[Prediction#Value] +) extends Estimator[OpPredictorWrapperModel[M]] with OpPipelineStage2[RealNN, OPVector, Prediction] + with SparkWrapperParams[E] { + + val operationName = predictor.getClass.getSimpleName + val inputParam1Name = SparkMLSharedParamConstants.LabelColName + val inputParam2Name = SparkMLSharedParamConstants.FeaturesColName + val outputParamName = SparkMLSharedParamConstants.PredictionColName + setDefault(sparkMlStage, Option(predictor)) + + /** + * Function that fits the binary model + */ + override def fit(dataset: Dataset[_]): OpPredictorWrapperModel[M] = { + setInputSchema(dataset.schema).transformSchema(dataset.schema) + copyValues(predictor) // when params are shared with wrapping class this will pass them into the model + + val p1 = predictor.getParam(inputParam1Name) + val p2 = predictor.getParam(inputParam2Name) + val po = predictor.getParam(outputParamName) + val model: M = predictor + .set(p1, in1.name) + .set(p2, in2.name) + .set(po, getOutputFeatureName) + .fit(dataset) + + SparkModelConverter.toOP(model, uid) + .setParent(this) + .setInput(in1.asFeatureLike[RealNN], in2.asFeatureLike[OPVector]) + .setMetadata(getMetadata()) + .setOutputFeatureName(getOutputFeatureName) + } +} + +abstract class OpPredictorWrapperModel[M <: PredictionModel[Vector, M]] +( + val operationName: String, + val uid: String, + val sparkModel: M )( - implicit tti1: TypeTag[I], - tto: TypeTag[O], - ttov: TypeTag[O#Value] -) extends SwBinaryEstimator[I, OPVector, O, M, E]( - inputParam1Name = SparkMLSharedParamConstants.LabelColName, - inputParam2Name = SparkMLSharedParamConstants.FeaturesColName, - outputParamName = SparkMLSharedParamConstants.PredictionColName, - operationName = predictor.getClass.getSimpleName, - // cloning below to prevent parameter changes to the underlying classifier outside the wrapper - sparkMlStageIn = Option(predictor).map(_.copy(ParamMap.empty)), - uid = uid -) { - final protected def getSparkStage: E = getSparkMlStage().get + implicit val tti1: TypeTag[RealNN], + val tti2: TypeTag[OPVector], + val tto: TypeTag[Prediction], + val ttov: TypeTag[Prediction#Value] +) extends Model[OpPredictorWrapperModel[M]] with OpTransformer2[RealNN, OPVector, Prediction] + with SparkWrapperParams[M] { + setDefault(sparkMlStage, Option(sparkModel)) } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierModel.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierModel.scala new file mode 100644 index 0000000000..291dc0bc50 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierModel.scala @@ -0,0 +1,76 @@ +/* + * 
Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.sparkwrappers.specific + +import com.salesforce.op.features.types._ +import org.apache.spark.ml.classification.ProbabilisticClassificationModel +import org.apache.spark.ml.linalg.Vector + +import scala.reflect.runtime.universe._ + +/** + * Class that takes in a spark ProbabilisticClassifierModel and wraps it into an OP model which returns a + * Prediction feature + * + * @param sparkModel model to wrap + * @param uid uid to give stage + * @param operationName unique name of the operation this stage performs + * @tparam T type of the model to wrap + */ +abstract class OpProbabilisticClassifierModel[T <: ProbabilisticClassificationModel[Vector, T]] +( + sparkModel: T, + uid: String, + operationName: String +) extends OpPredictorWrapperModel[T](uid = uid, operationName = operationName, sparkModel = sparkModel) { + + protected def predictRawMirror: MethodMirror + protected def raw2probabilityMirror: MethodMirror + protected def probability2predictionMirror: MethodMirror + + protected def predictRaw(features: Vector): Vector = predictRawMirror.apply(features).asInstanceOf[Vector] + protected def raw2probability(raw: Vector): Vector = raw2probabilityMirror.apply(raw).asInstanceOf[Vector] + protected def probability2prediction(prob: Vector): Double = + probability2predictionMirror.apply(prob).asInstanceOf[Double] + + /** + * Function used to convert input to output + */ + override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { + val raw = predictRaw(features.value) + val prob = raw2probability(raw) + val pred = probability2prediction(prob) + + Prediction(rawPrediction = raw, probability = prob, prediction = pred) + } + +} diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapper.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapper.scala deleted file mode 100644 
index cec3d62d15..0000000000 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapper.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.sparkwrappers.specific - -import com.salesforce.op.UID -import com.salesforce.op.features.types._ -import com.salesforce.op.stages.sparkwrappers.generic.SwThreeStageBinaryEstimator -import org.apache.spark.ml.SparkMLSharedParamConstants -import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier} -import org.apache.spark.ml.linalg.Vector -import org.apache.spark.ml.param.ParamMap - -/** - * Wraps a spark ML probabilistic classifier. In SparkML, a probabilistic classifier is anything that inherits - * from [[ProbabilisticClassifier]]. Examples of these probabilistic classifiers - * include: RandomForestClassifier, NaiveBayes, LogisticRegression, and DecisionTreeClassifier. - * These classifiers in spark ML output not a single column, but 3: (1) the raw unnormalized scores for each class, - * (2) the probabilistic classification (normalized raw scores), and - * (3) the labels of the output (e.g. max unnormalized score). - * The defining characteristic of classifiers intended to be wrapped by this class is that they output a model which - * takes in 2 columns as input (label and features) and output 3 columns as result. 
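The replacement path (OpProbabilisticClassifierModel above) folds those three columns into the single map-backed Prediction feature. Roughly, with hypothetical values and the accessor name assumed:

import com.salesforce.op.features.types.Prediction
import org.apache.spark.ml.linalg.Vectors

// hypothetical binary-classification output, packed the way transformFn above packs it
val pred = Prediction(
  prediction = 1.0,
  rawPrediction = Vectors.dense(-1.5, 1.5),
  probability = Vectors.dense(0.05, 0.95)
)
// pred.prediction == 1.0 (accessor name assumed); the raw scores and
// probabilities are flattened into the same underlying map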
- * - * @param probClassifier the probabilistic classifier to wrap - * @param uid stage uid - * @tparam E spark estimator to wrap - * @tparam M spark model type returned by spark estimator wrapped - */ -class OpProbabilisticClassifierWrapper[E <: ProbabilisticClassifier[Vector, E, M], -M <: ProbabilisticClassificationModel[Vector, M]] -( - val probClassifier: E, - uid: String = UID[OpProbabilisticClassifierWrapper[E, M]] -) extends SwThreeStageBinaryEstimator[RealNN, OPVector, RealNN, OPVector, OPVector, M, E]( - inputParam1Name = SparkMLSharedParamConstants.LabelColName, - inputParam2Name = SparkMLSharedParamConstants.FeaturesColName, - outputParam1Name = SparkMLSharedParamConstants.PredictionColName, - outputParam2Name = SparkMLSharedParamConstants.RawPredictionColName, - outputParam3Name = SparkMLSharedParamConstants.ProbabilityColName, - stage1OperationName = probClassifier.getClass.getSimpleName + "_" + SparkMLSharedParamConstants.PredictionColName , - stage2OperationName = probClassifier.getClass.getSimpleName + "_" + SparkMLSharedParamConstants.RawPredictionColName, - stage3OperationName = probClassifier.getClass.getSimpleName + "_" + SparkMLSharedParamConstants.ProbabilityColName, - // cloning below to prevent parameter changes to the underlying classifier outside the wrapper - sparkMlStageIn = Option(probClassifier).map(_.copy(ParamMap.empty)), - uid = uid -) { - final protected def getSparkStage: E = getSparkMlStage().get -} diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala new file mode 100644 index 0000000000..f77d0d1dfa --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/specific/SparkModelConverter.scala @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.sparkwrappers.specific + +import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.stages.base.binary.OpTransformer2 +import com.salesforce.op.stages.impl.classification._ +import com.salesforce.op.stages.impl.regression._ +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.regression._ +import org.apache.spark.ml.{Model, PredictionModel} + +/** + * Allows conversion from spark models to models that follow the OP convention of having a + * transformFn that can be called on a single row rather than the whole dataframe + */ +object SparkModelConverter { + + /** + * Converts supported spark model of type PredictionModel[Vector, T] to an OP model + * @param model model to convert + * @param uid uid to give converted model + * @tparam T type of model to convert + * @return Op Binary Model which will produce the same values put into a Prediction return feature + */ + def toOP[T <: PredictionModel[Vector, T]]( + model: T, + uid: String + ): OpPredictorWrapperModel[T] = { + toOPUnchecked(model, uid).asInstanceOf[OpPredictorWrapperModel[T]] + } + + /** + * Converts supported spark model of type PredictionModel[Vector, T] to an OP model + * @param model model to convert + * @tparam T type of model to convert + * @return Op Binary Model which will produce the same values put into a Prediction return feature + */ + // TODO remove when loco and model selector are updated + def toOPUnchecked[T <: Model[_]](model: T): OpTransformer2[RealNN, OPVector, Prediction] = + toOPUnchecked(model, model.uid) + + /** + * Converts supported spark model of type PredictionModel[Vector, T] to an OP model + * @param model model to convert + * @param uid uid to give converted model + * @tparam T type of model to convert + * @return Op Binary Model which will produce the same values put into a Prediction return feature + */ + // TODO remove when loco and model selector are updated + def toOPUnchecked[T <: Model[_]]( + model: T, + uid: String + ): OpTransformer2[RealNN, OPVector, Prediction] = { + model match { + case m: LogisticRegressionModel => new OpLogisticRegressionModel(m, uid = uid) + case m: RandomForestClassificationModel => new OpRandomForestClassificationModel(m, uid = uid) + case m: NaiveBayesModel => new OpNaiveBayesModel(m, uid) + case m: DecisionTreeClassificationModel => new OpDecisionTreeClassificationModel(m, uid = uid) + case m: GBTClassificationModel => new OpGBTClassificationModel(m, uid = uid) + case m: LinearSVCModel => new OpLinearSVCModel(m, uid = uid) + case m: MultilayerPerceptronClassificationModel => new OpMultilayerPerceptronClassificationModel(m, uid = uid) + case m: LinearRegressionModel => new OpLinearRegressionModel(m, uid = uid) + case m: RandomForestRegressionModel => new OpRandomForestRegressionModel(m, uid = uid) + case m: GBTRegressionModel => new OpGBTRegressionModel(m, uid = uid) + case m: DecisionTreeRegressionModel => new OpDecisionTreeRegressionModel(m, uid = uid) + case m: GeneralizedLinearRegressionModel => new OpGeneralizedLinearRegressionModel(m, uid = uid) + case m => throw new RuntimeException(s"model conversion not implemented for model $m") + } + } + +} diff --git a/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala b/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala index 6f1fda93a6..b43c4f644c 100644 --- a/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala +++ 
b/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala @@ -31,25 +31,70 @@ package com.salesforce.op.utils.stages -import com.salesforce.op.OpWorkflowModel +import com.salesforce.op.features.OPFeature +import com.salesforce.op.stages.impl.selector.{HasTestEval, ModelSelectorBase} import com.salesforce.op.stages.{OPStage, OpTransformer} -import com.salesforce.op.stages.impl.selector.HasTestEval -import org.apache.spark.ml.{Estimator, Transformer} +import com.salesforce.op.{OpWorkflow, OpWorkflowModel} +import org.apache.spark.ml.{Estimator, Model, Transformer} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.slf4j.Logger -import com.salesforce.op.utils.spark.RichDataset._ +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} +import org.slf4j.LoggerFactory +import scala.collection.mutable.ListBuffer + +/** + * Functionality for manipulating stages DAG and fitting stages + * + * NOTE: this should be kept private to OP, because we do not want users to mess with + * the internal mechanisms of our workflows. + */ private[op] case object FitStagesUtil { + /** + * DAG layer - stages with their distance pairs + */ + type Layer = Array[(OPStage, Int)] + + /** + * Stages DAG - unique stages layered by distance (desc order) + */ + type StagesDAG = Array[Layer] + + /** + * Model Selector type + */ + type MS = ModelSelectorBase[_ <: Model[_], _ <: Estimator[_]] + + /** + * Fitted DAG together with its training & test data + * + * @param trainData train data + * @param testData test data + * @param transformers fitted transformers + */ + case class FittedDAG(trainData: Dataset[Row], testData: Dataset[Row], transformers: Array[OPStage]) + + /** + * Extracted Model Selector and the DAG split into + * + * @param modelSelector optional model selector (if any) + * @param before DAG before CV/TS + * @param during DAG during CV/TS + * @param after DAG after CV/TS + */ + case class CutDAG(modelSelector: Option[(MS, Int)], before: StagesDAG, during: StagesDAG, after: StagesDAG) + + private val log = LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$")) + /** * Efficiently apply all op stages - * @param opStages list of op stages to apply - * @param df dataframe to apply them too + * + * @param opStages list of op stages to apply + * @param df dataframe to apply them to + * @return new data frame containing columns with output for all stages fed in */ - def applyOpTransformations(opStages: Array[_ <:OPStage with OpTransformer], df: DataFrame) - (implicit spark: SparkSession, log: Logger): DataFrame = { + def applyOpTransformations(opStages: Array[_ <: OPStage with OpTransformer], df: Dataset[Row]) + (implicit spark: SparkSession): Dataset[Row] = { if (opStages.isEmpty) df else { log.info("Applying {} OP stage(s): {}", opStages.length, opStages.map(_.uid).mkString(",")) @@ -87,8 +132,8 @@ private[op] case object FitStagesUtil { * @return Dataframe transformed data */ def applySparkTransformations( - data: DataFrame, transformers: Array[Transformer], persistEveryKStages: Int - )(implicit spark: SparkSession, log: Logger): DataFrame = { + data: Dataset[Row], transformers: Array[Transformer], persistEveryKStages: Int + )(implicit spark: SparkSession): Dataset[Row] = { // you have more than 5 stages and are not persisting at least once if (transformers.length > 5 && persistEveryKStages > transformers.length) { @@ -119,34 +164,110 @@ private[op] case object FitStagesUtil { transformedData } + /** + * Computes stages DAG + * 
+ * @param features array of features in workflow + * @return unique stages layered by distance (desc order) + */ + def computeDAG(features: Array[OPFeature]): StagesDAG = { + val (failures, parents) = features.map(_.parentStages()).partition(_.isFailure) + + if (failures.nonEmpty) { + throw new IllegalArgumentException("Failed to compute stages DAG", failures.head.failed.get) + } + + // Stages sorted by distance + val sortedByDistance: Array[(OPStage, Int)] = parents.flatMap(_.get) + + // Stages layered by distance + val layeredByDistance: StagesDAG = + sortedByDistance.groupBy(_._2).toArray + .map(_._2.sortBy(_._1.getOutputFeatureName)) + .sortBy(s => -s.head._2) + + // Unique stages layered by distance + layeredByDistance + .foldLeft(Set.empty[OPStage], Array.empty[Array[(OPStage, Int)]]) { + case ((seen, filtered), uncleaned) => + // filter out any seen stages. also add distinct to filter out any duplicate stages in layer + val unseen = uncleaned.filterNot(v => seen.contains(v._1)).distinct + val nowSeen = seen ++ unseen.map(_._1) + (nowSeen, filtered :+ unseen) + }._2 + } + + /** + * Fit DAG and apply transformations on data up to the last estimator stage + * + * @param dag DAG to fit + * @param train training dataset + * @param test test dataset + * @param hasTest whether a non-empty test dataset was provided + * @param indexOfLastEstimator Optional index of the last estimator + * @param persistEveryKStages frequency of persisting stages + * @param fittedTransformers list of already fitted transformers + * @param spark Spark session + * @return Fitted and Transformed train/test before the last estimator with fitted transformers + */ + def fitAndTransformDAG( + dag: StagesDAG, + train: Dataset[Row], + test: Dataset[Row], + hasTest: Boolean, + indexOfLastEstimator: Option[Int], + persistEveryKStages: Int = OpWorkflowModel.PersistEveryKStages, + fittedTransformers: Seq[OPStage] = Seq.empty + )(implicit spark: SparkSession): FittedDAG = { + val alreadyFitted: ListBuffer[OPStage] = ListBuffer(fittedTransformers: _*) + + val (newTrain, newTest) = + dag.foldLeft(train -> test) { case ((currTrain, currTest), stagesLayer) => + val index = stagesLayer.head._2 + val FittedDAG(newTrain, newTest, justFitted) = fitAndTransformLayer( + stagesLayer = stagesLayer, + train = currTrain, + test = currTest, + hasTest = hasTest, + transformData = indexOfLastEstimator.exists(_ < index), // only need to update for fit before last estimator + persistEveryKStages = persistEveryKStages + ) + alreadyFitted ++= justFitted + newTrain -> newTest + } + + FittedDAG(newTrain, newTest, alreadyFitted.toArray) + } /** * Fit a sequence of stages and transform a training and test dataset for use this function assumes all * stages fed in are on the same level of the dag - * @param train training dataset for estimators - * @param test test dataset for evaluation - * @param stages stages to fix - * @param transformData should the imput data be transformed or only used for fitting + * + * @param train training dataset for estimators + * @param test test dataset for evaluation + * @param hasTest whether a non-empty test dataset was provided + * @param stagesLayer stages to fit + * @param transformData should the input data be transformed or only used for fitting * @param persistEveryKStages persist data at this frequency during transformations - * @param doTest test data is nonempty * @return dataframes for train and test as well as the fitted stages */ - def fitAndTransform( - train: DataFrame, - test: DataFrame, - stages: 
Array[(OPStage)], + private def fitAndTransformLayer( + stagesLayer: Layer, + train: Dataset[Row], + test: Dataset[Row], + hasTest: Boolean, transformData: Boolean, - persistEveryKStages: Int, - doTest: Option[Boolean] = None - )(implicit spark: SparkSession, log: Logger): (DataFrame, DataFrame, Array[OPStage]) = { - - val testExists = doTest.getOrElse(!test.isEmpty) - val (estimators, noFit) = stages.partition( _.isInstanceOf[Estimator[_]] ) + persistEveryKStages: Int + )(implicit spark: SparkSession): FittedDAG = { + val stages = stagesLayer.map(_._1) + val (estimators, noFit) = stages.partition(_.isInstanceOf[Estimator[_]]) val fitEstimators = estimators.map { case e: Estimator[_] => e.fit(train) match { - case m: HasTestEval if testExists => m.evaluateModel(test) + case m: HasTestEval if hasTest => + m.evaluateModel(test) + m.asInstanceOf[OPStage] + case m => m.asInstanceOf[OPStage] - case m => m.asInstanceOf[OPStage] } } val transformers = noFit ++ fitEstimators @@ -160,16 +281,92 @@ private[op] case object FitStagesUtil { val withOPTrain = applyOpTransformations(opTransformers, train) val withAllTrain = applySparkTransformations(withOPTrain, sparkTransformers, persistEveryKStages) - val withAllTest = if (testExists) { + val withAllTest = if (hasTest) { val withOPTest = applyOpTransformations(opTransformers, test) applySparkTransformations(withOPTest, sparkTransformers, persistEveryKStages) } else test - (withAllTrain, withAllTest, transformers) + FittedDAG(trainData = withAllTrain, testData = withAllTest, transformers = transformers) } else { - (train, test, transformers) + FittedDAG(trainData = train, testData = test, transformers = transformers) + } + } + + /** + * Method that cuts the DAG in order to perform proper CV/TS. + * Extracts the Model Selector and splits the DAG into: + * 1. DAG before CV/TS + * 2. DAG during CV/TS + * 3. DAG after CV/TS + * + * @param dag DAG in the workflow to be cut + * @return (Model Selector, nonCVTS DAG - to be done outside of CV/TS, CVTS DAG - to apply in the CV/TS) + */ + def cutDAG(dag: StagesDAG): CutDAG = { + if (dag.isEmpty) CutDAG(None, Array(), Array(), Array()) + else { + // creates Array containing every Model Selector in the DAG + val modelSelectorArrays = dag.flatten.collect { case (ms: MS, dist: Int) => (ms, dist) } + val modelSelector = modelSelectorArrays.toList match { + case Nil => None + case List(ms) => Option(ms) + case modelSelectors => throw new IllegalArgumentException( + s"OpWorkflow can contain at most 1 Model Selector. Found ${modelSelectors.length} Model Selectors:" + + s" ${modelSelectors.map(_._1).mkString(",")}") + } + + // nonCVTS and CVTS DAGs + val (nonCVTSDAG: StagesDAG, inCVTSDAG: StagesDAG, afterCVTSDAG: StagesDAG) = + modelSelector.map { case (ms, dist) => + // Optimize the DAG by removing stages unrelated to ModelSelector + + // Create the DAG after Model Selector. + val (afterCVTSDAG, beforeCVDAG) = dag.partition(_.exists(_._2 < dist)) + + val modelSelectorDAG = computeDAG(Array(ms.getOutput())) + .dropRight(1) + .map(_.map{ case (stage, dist) => (stage, dist + afterCVTSDAG.length) }) + + // Create the DAG without Model Selector. It will be used to compute the final nonCVTS DAG.
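// Sketch of the intended contract, on a hypothetical workflow DAG
//   tokenize -> vectorize -> sanityCheck -> modelSelector -> calibrate:
//   val CutDAG(ms, before, during, after) = FitStagesUtil.cutDAG(dag)
// `before` would hold the tokenize/vectorize layers (fit once, outside CV/TS);
// `during` would hold the label-dependent sanityCheck layers, refit inside every
// CV/TS split because their inputs mix response and predictor features;
// `after` would hold the calibrate layers downstream of the model selector.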
+ val nonMSDAG: StagesDAG = beforeCVDAG.map(_.filterNot(_._1.isInstanceOf[MS])).filter(_.nonEmpty) + + // Index of first CVTS stage in ModelSelector DAG + val firstCVTSIndex = modelSelectorDAG.indexWhere(_.exists(stage => { + val inputs = stage._1.getTransientFeatures() + inputs.exists(_.isResponse) && inputs.exists(!_.isResponse) + })) + + // If no CVTS stages, the whole DAG is not in the CV/TS + if (firstCVTSIndex == -1) (nonMSDAG, Array.empty[Layer], afterCVTSDAG) else { + + val cVTSDAG = modelSelectorDAG.drop(firstCVTSIndex) + + // nonCVTSDAG is the complementary DAG + // The rule is "nonCVTSDAG = nonMSDAG - CVTSDAG" + val nonCVTSDAG = { + val flattenedCVTSDAG = cVTSDAG.flatten.map(_._1) + nonMSDAG.map(_.filterNot { case (stage: OPStage, _) => flattenedCVTSDAG.contains(stage) }) + .filter(_.nonEmpty) // Remove empty layers + } + + (nonCVTSDAG, cVTSDAG, afterCVTSDAG) + } + }.getOrElse((Array.empty[Layer], Array.empty[Layer], Array.empty[Layer])) + + CutDAG(modelSelector, before = nonCVTSDAG, during = inCVTSDAG, after = afterCVTSDAG) } } + /** + * Method that cuts the DAG in order to perform proper CV/TS. + * Extracts the Model Selector and splits the DAG into: + * 1. DAG before CV/TS + * 2. DAG during CV/TS + * 3. DAG after CV/TS + * + * @param wf workflow to be cut + * @return (Model Selector, nonCVTS DAG - to be done outside of CV/TS, CVTS DAG - to apply in the CV/TS) + */ + def cutDAG(wf: OpWorkflow): CutDAG = cutDAG(computeDAG(wf.getResultFeatures())) } diff --git a/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala b/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala index 4d4a76800c..57207290b7 100644 --- a/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/LuceneTextAnalyzer.scala @@ -32,13 +32,18 @@ package com.salesforce.op.utils.text import java.io.Reader +import java.nio.charset.StandardCharsets import com.salesforce.op.utils.text.Language._ +import org.apache.lucene.analysis._ import org.apache.lucene.analysis.ar.ArabicAnalyzer import org.apache.lucene.analysis.bg.BulgarianAnalyzer +import org.apache.lucene.analysis.bn.BengaliAnalyzer +import org.apache.lucene.analysis.br.BrazilianAnalyzer import org.apache.lucene.analysis.ca.CatalanAnalyzer import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter import org.apache.lucene.analysis.cjk.CJKAnalyzer +import org.apache.lucene.analysis.ckb.SoraniAnalyzer import org.apache.lucene.analysis.cz.CzechAnalyzer import org.apache.lucene.analysis.da.DanishAnalyzer import org.apache.lucene.analysis.de.GermanAnalyzer @@ -63,12 +68,13 @@ import org.apache.lucene.analysis.no.NorwegianAnalyzer import org.apache.lucene.analysis.pt.PortugueseAnalyzer import org.apache.lucene.analysis.ro.RomanianAnalyzer import org.apache.lucene.analysis.ru.RussianAnalyzer +import org.apache.lucene.analysis.snowball.SnowballFilter import org.apache.lucene.analysis.standard.StandardAnalyzer import org.apache.lucene.analysis.sv.SwedishAnalyzer import org.apache.lucene.analysis.th.ThaiAnalyzer import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.analysis.tr.TurkishAnalyzer -import org.apache.lucene.analysis.{Analyzer, AnalyzerWrapper, TokenStream} +import org.apache.lucene.util.IOUtils import scala.collection.mutable.ArrayBuffer @@ -118,21 +124,28 @@ class LuceneTextAnalyzer */ object LuceneTextAnalyzer { + private val englishStopwords = WordlistLoader.getSnowballWordSet( + 
IOUtils.getDecodingReader(classOf[SnowballFilter], "english_stop.txt", StandardCharsets.UTF_8) + ) + /** * Default analyzer to use if a language specific one is not present */ - val DefaultAnalyzer: Analyzer = new StandardAnalyzer() + val DefaultAnalyzer: Analyzer = new StandardAnalyzer(englishStopwords) // TODO we should add specific analyzers per each language if possible private val analyzers: Map[Language, Analyzer] = Map( Arabic -> new ArabicAnalyzer(), - Catalan -> new CatalanAnalyzer(), Bulgarian -> new BulgarianAnalyzer(), + Bengali -> new BengaliAnalyzer(), + Brazilian -> new BrazilianAnalyzer(), + Catalan -> new CatalanAnalyzer(), + Sorani -> new SoraniAnalyzer(), Czech -> new CzechAnalyzer(), Danish -> new DanishAnalyzer(), German -> new GermanAnalyzer(), Greek -> new GreekAnalyzer(), - English -> new EnglishAnalyzer(), + English -> new EnglishAnalyzer(englishStopwords), Spanish -> new SpanishAnalyzer(), Basque -> new BasqueAnalyzer(), Persian -> new PersianAnalyzer(), @@ -145,7 +158,7 @@ object LuceneTextAnalyzer { Indonesian -> new IndonesianAnalyzer(), Italian -> new ItalianAnalyzer(), Japanese -> new JapaneseAnalyzer(), - Korean -> new CJKAnalyzer(), + Korean -> new CJKAnalyzer(englishStopwords), Lithuanian -> new LithuanianAnalyzer(), Latvian -> new LatvianAnalyzer(), Dutch -> new DutchAnalyzer(), @@ -156,8 +169,8 @@ object LuceneTextAnalyzer { Swedish -> new SwedishAnalyzer(), Thai -> new ThaiAnalyzer(), Turkish -> new TurkishAnalyzer(), - SimplifiedChinese -> new CJKAnalyzer(), - TraditionalChinese -> new CJKAnalyzer() + SimplifiedChinese -> new CJKAnalyzer(englishStopwords), + TraditionalChinese -> new CJKAnalyzer(englishStopwords) ) private val defaultAnalyzerHtmlStrip = stripHtml(DefaultAnalyzer) diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpRandomForestRegressionModel.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPAnalyzer.scala similarity index 70% rename from core/src/main/scala/org/apache/spark/ml/regression/OpRandomForestRegressionModel.scala rename to core/src/main/scala/com/salesforce/op/utils/text/OpenNLPAnalyzer.scala index 298002ed45..a1a8d31418 100644 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpRandomForestRegressionModel.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPAnalyzer.scala @@ -29,24 +29,22 @@ * POSSIBILITY OF SUCH DAMAGE. 
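The behavioral effect of seeding StandardAnalyzer with the Snowball English stopword set, sketched assuming LuceneTextAnalyzer's default constructor and its analyze(String, Language) signature:

import com.salesforce.op.utils.text.{Language, LuceneTextAnalyzer}

// English stopwords should now be dropped by the default analyzer as well,
// not only by the language-specific EnglishAnalyzer
val tokens = new LuceneTextAnalyzer().analyze("The quick brown fox", Language.Unknown)
// expected under this assumption: Seq("quick", "brown", "fox")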
*/ -package org.apache.spark.ml.regression +package com.salesforce.op.utils.text -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} +import opennlp.tools.namefind.TokenNameFinderModel +import opennlp.tools.tokenize.TokenizerME -import scala.reflect.runtime.universe.TypeTag +/** + * OpenNLP text analyzer to apply when applying Open NLP + * [[TokenNameFinderModel]] + */ +class OpenNLPAnalyzer extends TextAnalyzer { + def analyze(s: String, language: Language): Seq[String] = { + OpenNLPModels.getTokenizerModel(language) match { + case Some(tokenizerModel) => new TokenizerME(tokenizerModel).tokenize(s) + case _ => Seq(s) + } + } -class OpRandomForestRegressionModel -( - val treesIn: Array[DecisionTreeRegressionModel], - numFeatures: Int, - uid: String = UID[OpRandomForestRegressionModel], - val operationName: String = "opRFR" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends RandomForestRegressionModel(uid = uid, _trees = treesIn, numFeatures = numFeatures) - with OpPredictionModelBase +} diff --git a/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPModels.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPModels.scala new file mode 100644 index 0000000000..50ffccc7f8 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPModels.scala @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
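A usage sketch for the analyzer above, grounded in its analyze(String, Language) signature; the sample sentence is illustrative:

import com.salesforce.op.utils.text.{Language, OpenNLPAnalyzer}

val analyzer = new OpenNLPAnalyzer()
// English ships with a bundled tokenizer model, so this tokenizes via TokenizerME;
// a language without a bundled model gets the input back as a single token
val tokens: Seq[String] = analyzer.analyze("Salesforce was founded in San Francisco.", Language.English)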
+ */ + +package com.salesforce.op.utils.text + +import java.io.InputStream + +import com.salesforce.op.utils.text.Language._ +import com.salesforce.op.utils.text.NameEntityType._ +import opennlp.tools.namefind.TokenNameFinderModel +import opennlp.tools.sentdetect.SentenceModel +import opennlp.tools.tokenize.TokenizerModel + +/** + * A factory to get/create OpenNLP models + */ +object OpenNLPModels { + // Assumes that models are stored as a resource + private val modelsPath = "/OpenNLP" + + private lazy val tokenNameModels: Map[(Language, NameEntityType), TokenNameFinderModel] = Map( + (English, Date) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-date.bin"), + (English, Location) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-location.bin"), + (English, Money) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-money.bin"), + (English, Organization) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-organization.bin"), + (English, Percentage) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-percentage.bin"), + (English, Person) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-person.bin"), + (English, Time) -> loadTokenNameFinderModel(s"$modelsPath/en-ner-time.bin"), + + (Spanish, Location) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-location.bin"), + (Spanish, Organization) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-organization.bin"), + (Spanish, Person) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-person.bin"), + (Spanish, Misc) -> loadTokenNameFinderModel(s"$modelsPath/es-ner-misc.bin"), + + (Dutch, Location) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-location.bin"), + (Dutch, Organization) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-organization.bin"), + (Dutch, Person) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-person.bin"), + (Dutch, Misc) -> loadTokenNameFinderModel(s"$modelsPath/nl-ner-misc.bin") + ) + + private lazy val sentenceModels: Map[Language, SentenceModel] = Map( + Danish -> loadSentenceModel(s"$modelsPath/da-sent.bin"), + English -> loadSentenceModel(s"$modelsPath/en-sent.bin"), + German -> loadSentenceModel(s"$modelsPath/de-sent.bin"), + Dutch -> loadSentenceModel(s"$modelsPath/nl-sent.bin"), + Portuguese -> loadSentenceModel(s"$modelsPath/pt-sent.bin"), + Sami -> loadSentenceModel(s"$modelsPath/se-sent.bin") + ) + + private lazy val tokenizerModels: Map[Language, TokenizerModel] = Map( + Danish -> loadTokenizerModel(s"$modelsPath/da-token.bin"), + German -> loadTokenizerModel(s"$modelsPath/de-token.bin"), + English -> loadTokenizerModel(s"$modelsPath/en-token.bin"), + Dutch -> loadTokenizerModel(s"$modelsPath/nl-token.bin"), + Portuguese -> loadTokenizerModel(s"$modelsPath/pt-token.bin"), + Sami -> loadTokenizerModel(s"$modelsPath/se-token.bin") + ) + + /** + * Factory to get [[TokenNameFinderModel]] for a given language & entity type if it exists + * + * @return some [[TokenNameFinderModel]] instance or None + */ + def getTokenNameFinderModel(language: Language, entity: NameEntityType): Option[TokenNameFinderModel] = + tokenNameModels.get(language -> entity) + + /** + * Factory to get [[SentenceModel]] for a given language + * + * @return some [[SentenceModel]] instance or None + */ + def getSentenceModel(language: Language): Option[SentenceModel] = + sentenceModels.get(language) + + /** + * Factory to get [[TokenizerModel]] for a given language + * + * @return some [[TokenizerModel]] instance or None + */ + def getTokenizerModel(language: Language): Option[TokenizerModel] = + tokenizerModels.get(language) + + private def 
loadTokenNameFinderModel(resourcePath: String): TokenNameFinderModel = { + val modelStream = loadFromResource(resourcePath) + new TokenNameFinderModel(modelStream) + } + + private def loadSentenceModel(resourcePath: String): SentenceModel = { + val modelStream = loadFromResource(resourcePath) + new SentenceModel(modelStream) + } + + private def loadTokenizerModel(resourcePath: String): TokenizerModel = { + val modelStream = loadFromResource(resourcePath) + new TokenizerModel(modelStream) + } + + private def loadFromResource(resourcePath: String): InputStream = + // getResourceAsStream returns null (it does not throw) when the resource is missing, + // so check for it explicitly to fail with a helpful message + Option(getClass.getResourceAsStream(resourcePath)).getOrElse( + throw new RuntimeException( + s"Failed to load OpenNLP model from resource '$resourcePath'. " + + "Make sure to include OP 'models' dependency jar in your application classpath." + ) + ) + +} diff --git a/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTagger.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTagger.scala new file mode 100644 index 0000000000..11c2a51023 --- /dev/null +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTagger.scala @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
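A quick sketch of the factory contract these analyzers rely on (the outcomes follow from the model maps above; only languages with a bundled .bin resource yield Some):

import com.salesforce.op.utils.text._

OpenNLPModels.getSentenceModel(Language.English)                                // Some(sentence model)
OpenNLPModels.getTokenizerModel(Language.German)                                // Some(tokenizer model)
OpenNLPModels.getTokenNameFinderModel(Language.Spanish, NameEntityType.Person)  // Some(NER model)
OpenNLPModels.getTokenNameFinderModel(Language.German, NameEntityType.Person)   // None: no German NER model is bundled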
+ */ + +package com.salesforce.op.utils.text + +import com.salesforce.op.utils.text.NameEntityType._ +import com.twitter.algebird.Monoid._ +import com.twitter.algebird.Operators._ +import opennlp.tools.namefind.NameFinderME +import opennlp.tools.util.Span + +/** + * OpenNLP implementation of [[NameEntityTagger]] + */ +class OpenNLPNameEntityTagger extends NameEntityTagger[OpenNLPTagResult] { + + /** + * Apply the name entity recognition models for the requested entity types to the sentence tokens + * + * @param tokens sentence tokens + * @param language language + * @param entitiesToTag entities to tag if found + * @return tag result with a map of each token to the entities recognized for it + */ + def tag( + tokens: Seq[String], + language: Language, + entitiesToTag: Seq[NameEntityType] + ): OpenNLPTagResult = { + val tokensArr = tokens.toArray + val empty = Map.empty[String, Set[NameEntityType]] + val tags = entitiesToTag.foldLeft(empty) { (acc, entityToTag) => + OpenNLPModels.getTokenNameFinderModel(language, entityToTag) match { + case None => acc + case Some(model) => + val finder = new NameFinderME(model) + val spans = finder.find(tokensArr) + val res = convertSpansToMap(spans, tokensArr) + acc + res + } + } + OpenNLPTagResult(tags) + } + + /** + * Convert the name finder output spans into a map from token to entity types + * + * @param spans open nlp name entity finder model output + * @param tokens sentence tokens + * @return map of token and its tag set + */ + private[op] def convertSpansToMap(spans: Seq[Span], tokens: Array[String]): Map[String, Set[NameEntityType]] = { + // span objects provide exclusive end index + val pairSeq = for { + span <- spans + entity = Seq(nameEntityType(span.getType.toLowerCase)) + token <- tokens.slice(span.getStart, span.getEnd) + } yield token -> entity + + // aggregate results by token and convert the output to a map + pairSeq + .groupBy { case (token, _) => token } + .map { case (token, entities) => + token -> entities.flatMap(_._2).toSet + } + } + + private def nameEntityType: String => NameEntityType = { + case "date" => Date + case "location" => Location + case "money" => Money + case "organization" => Organization + case "percentage" => Percentage + case "person" => Person + case "time" => Time + case "misc" => Misc + case _ => Other + } +} + + +/** + * OpenNLP implementation of [[TaggerResult]] + * + * @param tokenTags token tags map, where keys are token and values are entities matching each token + */ +case class OpenNLPTagResult(tokenTags: Map[String, Set[NameEntityType]]) extends TaggerResult diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpLinearPredictionModel.scala b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitter.scala similarity index 70% rename from core/src/main/scala/org/apache/spark/ml/regression/OpLinearPredictionModel.scala rename to core/src/main/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitter.scala index 1c79a2d0bf..d9d9bf68c7 100644 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpLinearPredictionModel.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitter.scala @@ -29,24 +29,20 @@ * POSSIBILITY OF SUCH DAMAGE.
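To make the tagging flow concrete, a hedged sketch (the tokens and the resulting tag sets are illustrative; whether a model actually fires on them depends on the bundled binaries):

import com.salesforce.op.utils.text._

val tagger = new OpenNLPNameEntityTagger()
val tokens = Seq("Scott", "works", "at", "Salesforce", "in", "California")
val res = tagger.tag(tokens, Language.English, Seq(NameEntityType.Person, NameEntityType.Location))
// res.tokenTags maps each recognized token to every entity type it matched,
// e.g. Map("Scott" -> Set(Person), "California" -> Set(Location))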
*/ -package org.apache.spark.ml.regression +package com.salesforce.op.utils.text -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.linalg.Vector +import opennlp.tools.sentdetect.SentenceDetectorME -import scala.reflect.runtime.universe.TypeTag +/** + * Implementation of [[SentenceSplitter]] backed by the OpenNLP sentence detector + */ +class OpenNLPSentenceSplitter extends SentenceSplitter { + + def getSentences(input: String, language: Language): Seq[String] = { + OpenNLPModels.getSentenceModel(language) match { + case Some(sentenceModel) => new SentenceDetectorME(sentenceModel).sentDetect(input) + case None => Seq(input) // no model for this language: return the input as a single sentence + } + } -class OpLinearPredictionModel -( - coefficients: Vector, - intercept: Double, - uid: String = UID[OpLinearPredictionModel], - val operationName: String = "opLP" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends LinearRegressionModel(uid = uid, coefficients = coefficients, intercept = intercept) - with OpPredictionModelBase +} diff --git a/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala b/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala index 40ade0815a..f680dba202 100644 --- a/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala +++ b/core/src/main/scala/com/salesforce/op/utils/text/OptimaizeLanguageDetector.scala @@ -31,10 +31,10 @@ package com.salesforce.op.utils.text +import com.optimaize.langdetect.LanguageDetectorBuilder import com.optimaize.langdetect.i18n.LdLocale import com.optimaize.langdetect.ngram.NgramExtractors import com.optimaize.langdetect.profiles.LanguageProfileReader -import com.optimaize.langdetect.{LanguageDetectorBuilder, LanguageDetector => OLanguageDetector} import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ diff --git a/core/src/main/scala/org/apache/spark/ml/SparkModelConverter.scala b/core/src/main/scala/org/apache/spark/ml/SparkModelConverter.scala deleted file mode 100644 index fa76f238d3..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/SparkModelConverter.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.
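And a matching sketch for the splitter (the sentence text is illustrative; the Spanish call exercises the None fallback, since no Spanish SentenceModel is bundled):

import com.salesforce.op.utils.text._

val splitter = new OpenNLPSentenceSplitter()
splitter.getSentences("Hello there. How are you?", Language.English)
// e.g. Seq("Hello there.", "How are you?")
splitter.getSentences("Hola. ¿Qué tal?", Language.Spanish) // Seq("Hola. ¿Qué tal?")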
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package org.apache.spark.ml - -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} -import com.salesforce.op.stages.base.binary.OpTransformer2 -import org.apache.spark.ml.classification._ -import org.apache.spark.ml.regression._ - -/** - * Allows conversion from spark models to models that follow the OP convention of having a - * transformFn that can be called on a single row rather than the whole dataframe - */ -object SparkModelConverter { - - def toOP[T <: Transformer]( - model: Option[T], - isMultinomial: Boolean = false - ): OpTransformer2[RealNN, OPVector, Prediction] = { - model match { - case None => throw new RuntimeException("no model found") - case Some(m: LogisticRegressionModel) => - new OpLogisticRegressionModel(m.coefficientMatrix, m.interceptVector, m.numClasses, isMultinomial) - case Some(m: RandomForestClassificationModel) => - new OpRandomForestClassificationModel(m.trees, m.numFeatures, m.numClasses) - case Some(m: NaiveBayesModel) => - new OpNaiveBayesModel(m.pi, m.theta, m.oldLabels, if (isMultinomial) "multinomial" else "bernoulli") - case Some(m: DecisionTreeClassificationModel) => - new OpDecisionTreeClassificationModel(m.rootNode, m.numFeatures, m.numClasses) - case Some(m: LinearRegressionModel) => - new OpLinearPredictionModel(m.coefficients, m.intercept) - case Some(m: RandomForestRegressionModel) => - new OpRandomForestRegressionModel(m.trees, m.numFeatures) - case Some(m: GBTRegressionModel) => - new OpGBTRegressionModel(m.trees, m.treeWeights, m.numFeatures) - case Some(m: DecisionTreeRegressionModel) => - new OpDecisionTreeRegressionModel(m.rootNode, m.numFeatures) - case m => throw new RuntimeException(s"model conversion not implemented for model $m") - } - } -} diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpClassifierModelBase.scala b/core/src/main/scala/org/apache/spark/ml/classification/ClassifierParams.scala similarity index 71% rename from core/src/main/scala/org/apache/spark/ml/classification/OpClassifierModelBase.scala rename to core/src/main/scala/org/apache/spark/ml/classification/ClassifierParams.scala index e363b07868..2434571eea 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpClassifierModelBase.scala +++ b/core/src/main/scala/org/apache/spark/ml/classification/ClassifierParams.scala @@ -31,25 +31,19 @@ package org.apache.spark.ml.classification -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} -import com.salesforce.op.stages.base.binary.OpTransformer2 -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.tree.{DecisionTreeClassifierParams, GBTClassifierParams, RandomForestClassifierParams} -trait OpClassifierModelBase extends OpTransformer2[RealNN, OPVector, Prediction] { +trait OpDecisionTreeClassifierParams extends DecisionTreeClassifierParams - self: ProbabilisticClassificationModel[Vector, _] => +trait OpGBTClassifierParams extends GBTClassifierParams +trait OpLinearSVCParams extends 
LinearSVCParams - /** - * Function used to convert input to output - */ - override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => { - val raw = predictRaw(features.value) - val prob = raw2probability(raw) - val pred = probability2prediction(prob) +trait OpLogisticRegressionParams extends LogisticRegressionParams - Prediction(rawPrediction = raw, probability = prob, prediction = pred) - } +trait OpMultilayerPerceptronClassifierParams extends MultilayerPerceptronParams -} +trait OpNaiveBayesParams extends NaiveBayesParams + +trait OpRandomForestClassifierParams extends RandomForestClassifierParams with ProbabilisticClassifierParams diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpRandomForestClassifierModel.scala b/core/src/main/scala/org/apache/spark/ml/classification/OpRandomForestClassifierModel.scala deleted file mode 100644 index e2f22dd2a4..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpRandomForestClassifierModel.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
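The one-line traits above read as visibility shims: Spark declares its *Params traits package-private, so public aliases defined inside org.apache.spark.ml let OP stages in other packages mix them in. A hedged sketch of the pattern (the probe class is hypothetical, not part of this patch):

import org.apache.spark.ml.classification.OpLogisticRegressionParams
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable

// Hypothetical: a class outside org.apache.spark.ml inheriting Spark's
// package-private logistic regression params via the public alias trait
class LogRegParamsProbe(val uid: String = Identifiable.randomUID("probe"))
  extends OpLogisticRegressionParams {
  override def copy(extra: ParamMap): LogRegParamsProbe = defaultCopy(extra)
  def reg: Double = getRegParam // params such as regParam come along for free
}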
- */ - -package org.apache.spark.ml.classification - -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} - -import scala.reflect.runtime.universe.TypeTag - -class OpRandomForestClassificationModel -( - val treesIn: Array[DecisionTreeClassificationModel], - numFeatures: Int, - numClasses: Int, - uid: String = UID[OpRandomForestClassificationModel], - val operationName: String = "opRF" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends RandomForestClassificationModel(uid = uid, _trees = treesIn, numFeatures = numFeatures, - numClasses = numClasses) with OpClassifierModelBase diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpDecisionTreeRegressionModel.scala b/core/src/main/scala/org/apache/spark/ml/regression/OpDecisionTreeRegressionModel.scala deleted file mode 100644 index 61f60ebcf0..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpDecisionTreeRegressionModel.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package org.apache.spark.ml.regression - -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} -import org.apache.spark.ml.tree.Node - -import scala.reflect.runtime.universe.TypeTag - -class OpDecisionTreeRegressionModel -( - rootNode: Node, - numFeatures: Int, - uid: String = UID[OpDecisionTreeRegressionModel], - val operationName: String = "opDTR" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends DecisionTreeRegressionModel(uid = uid, rootNode = rootNode, numFeatures = numFeatures) - with OpPredictionModelBase diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpGBTRegressionModel.scala b/core/src/main/scala/org/apache/spark/ml/regression/OpGBTRegressionModel.scala deleted file mode 100644 index 2a9ae09d51..0000000000 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpGBTRegressionModel.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package org.apache.spark.ml.regression - -import com.salesforce.op.UID -import com.salesforce.op.features.types.{OPVector, Prediction, RealMap, RealNN} - -import scala.reflect.runtime.universe.TypeTag - -class OpGBTRegressionModel -( - val treesIn: Array[DecisionTreeRegressionModel], - val treeWeightsIn: Array[Double], - numFeatures: Int, - uid: String = UID[OpGBTRegressionModel], - val operationName: String = "opGBTR" -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends GBTRegressionModel(uid = uid, _trees = treesIn, _treeWeights = treeWeightsIn, numFeatures = numFeatures) - with OpPredictionModelBase diff --git a/core/src/main/scala/org/apache/spark/ml/regression/OpPredictionModelBase.scala b/core/src/main/scala/org/apache/spark/ml/regression/RegressorParams.scala similarity index 75% rename from core/src/main/scala/org/apache/spark/ml/regression/OpPredictionModelBase.scala rename to core/src/main/scala/org/apache/spark/ml/regression/RegressorParams.scala index 5ec106d757..6f08a7fdf9 100644 --- a/core/src/main/scala/org/apache/spark/ml/regression/OpPredictionModelBase.scala +++ b/core/src/main/scala/org/apache/spark/ml/regression/RegressorParams.scala @@ -31,18 +31,15 @@ package org.apache.spark.ml.regression -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} -import com.salesforce.op.stages.base.binary.OpTransformer2 -import org.apache.spark.ml.PredictionModel -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.tree.{DecisionTreeRegressorParams, GBTRegressorParams, RandomForestRegressorParams} -trait OpPredictionModelBase extends OpTransformer2[RealNN, OPVector, Prediction] { - self: PredictionModel[Vector, _] => +trait OpDecisionTreeRegressorParams extends DecisionTreeRegressorParams - /** - * Function used to convert input to output - */ - override def transformFn: (RealNN, OPVector) => Prediction = (label, features) => - Prediction(prediction = predict(features.value)) +trait OpLinearRegressionParams extends LinearRegressionParams + +trait OpGBTRegressorParams extends GBTRegressorParams + +trait OpGeneralizedLinearRegressionParams extends GeneralizedLinearRegressionBase + +trait OpRandomForestRegressorParams extends RandomForestRegressorParams -} diff --git a/core/src/test/avro/PassengerDataAll.avsc b/core/src/test/avro/PassengerDataAll.avsc new file mode 100644 index 0000000000..031dd116e6 --- /dev/null +++ b/core/src/test/avro/PassengerDataAll.avsc @@ -0,0 +1,43 @@ +{ + "type" : "record", + "name" : "PassengerDataAll", + "namespace" : "com.salesforce.app.schema", + "fields" : [ { + "name" : "PassengerId", + "type" : [ "int", "null" ] + }, { + "name" : "Survived", + "type" : "int", + "default": 0 + }, { + "name" : "Pclass", + "type" : [ "int", "null" ] + }, { + "name" : "Name", + "type" : [ "string", "null" ] + }, { + "name" : "Sex", + "type" : [ "string", "null" ] + }, { + "name" : "Age", + "type" : [ "double", "null" ] + }, { + "name" : "SibSp", + "type" : [ "int", "null" ] + }, { + "name" : "Parch", + "type" : [ "int", "null" ] + }, { + "name" : "Ticket", + "type" : [ "string", "null" ] + }, { + "name" : "Fare", + "type" : [ "double", "null" ] + }, { + "name" : "Cabin", + "type" : [ "string", "null" ] + }, { + "name" : "Embarked", + "type" : [ "string", "null" ] + } ] +} diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties index f178bf8434..a2162e650f 100644 --- 
a/core/src/test/resources/log4j.properties +++ b/core/src/test/resources/log4j.properties @@ -19,3 +19,9 @@ log4j.logger.com.databricks.spark.avro=WARN # Optimus Prime logging log4j.logger.com.salesforce.op=ERROR log4j.logger.com.salesforce.op.utils.spark.OpSparkListener=OFF + +# Breeze +log4j.logger.breeze.optimize=ERROR + +# BLAS & LAPACK +log4j.logger.com.github.fommil.netlib=ERROR diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowCVTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowCVTest.scala new file mode 100644 index 0000000000..6f7ee25684 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowCVTest.scala @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op + +import com.salesforce.app.schema.PassengerDataAll +import com.salesforce.op.evaluators._ +import com.salesforce.op.features._ +import com.salesforce.op.features.types._ +import com.salesforce.op.readers._ +import com.salesforce.op.stages.base.binary.BinaryTransformer +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry._ +import com.salesforce.op.stages.impl.classification._ +import com.salesforce.op.stages.impl.preparators.SanityChecker +import com.salesforce.op.stages.impl.regression.{LossType, RegressionModelSelector, RegressionModelsToTry} +import com.salesforce.op.stages.impl.selector.{ModelSelectorBase, ModelSelectorBaseNames} +import com.salesforce.op.stages.impl.tuning._ +import com.salesforce.op.test.PassengerSparkFixtureTest +import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.DataFrame +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner +import org.slf4j.LoggerFactory + +@RunWith(classOf[JUnitRunner]) +class OpWorkflowCVTest extends FlatSpec with PassengerSparkFixtureTest { + + val log = LoggerFactory.getLogger(this.getClass) + + trait PassenserCSVforCV { + val simplePassengerForCV = DataReaders.Simple.csv[PassengerDataAll]( + path = Some(s"$testDataPath/PassengerDataAll.csv"), + schema = PassengerDataAll.getClassSchema.toString, + key = _.getPassengerId.toString + ) + val age = FeatureBuilder.Real[PassengerDataAll].extract(_.getAge.toReal).asPredictor + val sex = FeatureBuilder.PickList[PassengerDataAll].extract(_.getSex.toPickList).asPredictor + val fair = FeatureBuilder.Real[PassengerDataAll].extract(p => Option(p.getFare).map(_.toDouble).toReal).asPredictor + val pClass = FeatureBuilder.PickList[PassengerDataAll].extract(_.getPclass.toString.toPickList).asPredictor + val cabin = FeatureBuilder.PickList[PassengerDataAll].extract(_.getCabin.toPickList).asPredictor + val survived = FeatureBuilder.Binary[PassengerDataAll].extract(p => p.getSurvived.intValue.toBinary).asResponse + val survivedPred = FeatureBuilder.Binary[PassengerDataAll].extract(p => p.getSurvived.intValue.toBinary).asPredictor + val survivedNum = survived.occurs() + } + + + Spec[OpWorkflow] should + "return a binary classification model that runs cv at the workflow level" in new PassenserCSVforCV { + val fv = Seq(age, sex, fair, pClass, cabin).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val (pred1, _, prob1) = new BinaryClassificationModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression, RandomForest) + .setLogisticRegressionRegParam(10000) + .setLogisticRegressionElasticNetParam(0.01, 0.5) + .setRandomForestMaxBins(10) + .setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1, prob1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + val (pred2, _, prob2) = new BinaryClassificationModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, 
reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression, RandomForest) + .setLogisticRegressionRegParam(10000) + .setLogisticRegressionElasticNetParam(0.01, 0.5) + .setRandomForestMaxBins(10) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2, prob2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.HoldOutEval) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + } + + it should "return a multi classification model that runs ts at the workflow level" in new PassenserCSVforCV { + val fv = Seq(age, sex, fair, pClass, cabin).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val (pred1, _, prob1) = new MultiClassificationModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.MultiClassification.error()), + splitter = Option(DataCutter(reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpMultiClassificationEvaluator()) + ).setModelsToTry(LogisticRegression, DecisionTree) + .setLogisticRegressionMaxIter(10) + .setLogisticRegressionRegParam(0.1) + .setDecisionTreeMaxDepth(5, 10) + .setDecisionTreeMinInfoGain(100000) + .setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1, prob1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + + val (pred2, _, prob2) = new MultiClassificationModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.MultiClassification.error()), + splitter = Option(DataCutter(reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpMultiClassificationEvaluator()) + ).setModelsToTry(LogisticRegression, DecisionTree) + .setLogisticRegressionMaxIter(10) + .setLogisticRegressionRegParam(0.1) + .setDecisionTreeMaxDepth(5, 10) + .setDecisionTreeMinInfoGain(100000) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2, prob2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + log.info(summary) + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.HoldOutEval) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + + } + + it should "return a regression model that runs cv at the workflow level" in new PassenserCSVforCV { + val fv = Seq(sex, fair, pClass, cabin, age).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val pred1 = new RegressionModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.Regression.r2()), + dataSplitter = None, + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.RandomForestRegression) + .setLinearRegressionElasticNetParam(0.01) + .setRandomForestMinInfoGain(10000) + 
.setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + val pred2 = new RegressionModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.Regression.r2()), + dataSplitter = None, + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.RandomForestRegression) + .setLinearRegressionElasticNetParam(0.01) + .setRandomForestMinInfoGain(10000) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + log.info(summary) + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + } + + it should "return a regression model that runs ts at the workflow level" in new PassenserCSVforCV { + val fv = Seq(sex, fair, pClass, cabin, age).transmogrify() + val checked = survivedNum.sanityCheck(fv) + + val pred1 = new RegressionModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.Regression.r2()), + dataSplitter = Option(DataSplitter(seed = 0L)), + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.GBTRegression) + .setLinearRegressionRegParam(100000) + .setGradientBoostedTreeLossType(LossType.Absolute) + .setInput(survivedNum, checked) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + val pred2 = new RegressionModelSelector( + validator = new OpTrainValidationSplit(evaluator = Evaluators.Regression.r2()), + dataSplitter = Option(DataSplitter(seed = 0L)), + evaluators = Seq(new OpRegressionEvaluator()) + ).setModelsToTry(RegressionModelsToTry.LinearRegression, RegressionModelsToTry.GBTRegression) + .setLinearRegressionRegParam(100000) + .setGradientBoostedTreeLossType(LossType.Absolute) + .setInput(survivedNum, checked) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = false) + + compare(data1, data2, pred1, pred2) + + val summary = model1.summary() + log.info(summary) + summary.contains(classOf[SanityChecker].getSimpleName) shouldBe true + summary.contains(ModelSelectorBaseNames.HoldOutEval) shouldBe true + summary.contains(ModelSelectorBaseNames.TrainingEval) shouldBe true + } + + it should "avoid adding label leakage when feature engineering would introduce it" in new PassenserCSVforCV { + + val fairLeaker = fair.autoBucketize(survivedNum, trackNulls = false) + val ageLeaker = age.autoBucketize(survivedNum, trackNulls = false) + val fv = Seq(age, sex, ageLeaker, fairLeaker, pClass, cabin) + .transmogrify() + + val (pred1, _, _) = new BinaryClassificationModelSelector( + validator = new 
OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0.0, 0.001, 0.1) + .setInput(survivedNum, fv) + .getOutput() + + val wf1 = new OpWorkflow().withWorkflowCV.setResultFeatures(pred1) + wf1.isWorkflowCV shouldBe true + val model1 = wf1.setReader(simplePassengerForCV).train() + val data1 = model1.score(keepRawFeatures = false, keepIntermediateFeatures = true) + + val (pred2, _, _) = new BinaryClassificationModelSelector( + validator = new OpCrossValidation(evaluator = Evaluators.BinaryClassification.auPR(), numFolds = 2, seed = 0L), + splitter = Option(DataBalancer(sampleFraction = 0.01, reserveTestFraction = 0.2, seed = 0L)), + evaluators = Seq(new OpBinaryClassificationEvaluator) + ).setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0.0, 0.001, 0.1) + .setInput(survivedNum, fv) + .getOutput() + + val wf2 = new OpWorkflow().setResultFeatures(pred2) + wf2.isWorkflowCV shouldBe false + val model2 = wf2.setReader(simplePassengerForCV).train() + val data2 = model2.score(keepRawFeatures = false, keepIntermediateFeatures = true) + + // CV + model1.summary().contains(""""area under PR" : "0.802""") shouldBe true + model1.summary().contains(""""area under PR" : "0.81""") shouldBe false + model2.summary().contains(""""area under PR" : "0.81""") shouldBe true + } + + def compare(data1: DataFrame, data2: DataFrame, f1: FeatureLike[_], f2: FeatureLike[_]): Unit = { + + val winner1 = f1.originStage.asInstanceOf[ModelSelectorBase[_, _]].bestEstimator.get + val winner2 = f2.originStage.asInstanceOf[ModelSelectorBase[_, _]].bestEstimator.get + winner1.estimator.getClass shouldEqual winner2.estimator.getClass + winner1.estimator.asInstanceOf[PipelineStage].extractParamMap.toSeq.sortBy(_.param.name).map(_.value) should + contain theSameElementsAs + winner2.estimator.asInstanceOf[PipelineStage].extractParamMap.toSeq.sortBy(_.param.name).map(_.value) + + val d1s = data1.collect().sortBy(_.getAs[String]("key")) + val d2s = data2.collect().sortBy(_.getAs[String]("key")) + d1s.zip(d2s).foreach{ + case (r1, r2) => + math.abs(r1.getDouble(0) - r2.getDouble(0)) < 0.5 shouldBe true + if (r1.size > 2) math.abs(r1.getAs[Vector](1)(0) - r2.getAs[Vector](1)(0) ) < 0.5 shouldBe true + } + } + +} + +class Leaker(uid: String = UID[BinaryTransformer[_, _, _]]) extends + BinaryTransformer[Real, RealNN, RealNN](operationName = "makeLeaker", uid = uid) { + override def transformFn: (Real, RealNN) => RealNN = + (f: Real, l: RealNN) => if (l.v.exists(_ > 0)) 1.0.toRealNN else 0.0.toRealNN + override def outputIsResponse: Boolean = false +} diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala index 48b5eb843f..7e3107bf9a 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowCoreTest.scala @@ -32,7 +32,7 @@ package com.salesforce.op -import com.salesforce.op.DAG._ +import com.salesforce.op.utils.stages.FitStagesUtil._ import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} @@ -43,8 +43,8 @@ import com.salesforce.op.test.{TestFeatureBuilder, 
TestSparkContext} import com.salesforce.op.testkit.{RandomBinary, RandomReal, RandomVector} import org.apache.spark.ml.{Estimator, Model} import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) @@ -74,13 +74,15 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { // Workflow val wf = new OpWorkflow() - Spec[OpWorkflowCore] should "handle empty DAG" in { assert( res = cutDAG(wf), - modelSelector = None, - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = None, + before = Array.empty[Layer], + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -90,9 +92,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array.empty[Layer], + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -104,9 +109,31 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 2))), - cVTSDAG = Array(Array((sanityChecker, 1))) + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 2))), + during = Array(Array((sanityChecker, 1))), + after = Array.empty[Layer] + ) + ) + } + + it should "cut simple DAG with nonCVTS and cVTS stage and stages after CV" in { + val ldaFeatures = lda.setInput(features).getOutput() + val checkedFeatures = sanityChecker.setInput(label, ldaFeatures).getOutput() + val ms = BinaryClassificationModelSelector() + val (pred, _, _) = ms.setInput(label, checkedFeatures).getOutput() + val zNormalize = new OpScalarStandardScaler() + val realPred = zNormalize.setInput(pred).getOutput() + + assert( + res = cutDAG(wf.setResultFeatures(realPred)), + expected = CutDAG( + modelSelector = Option((ms.stage1, 1)), + before = Array(Array((lda, 3))), + during = Array(Array((sanityChecker, 2))), + after = Array(Array((zNormalize, 0))) + ) ) } @@ -117,9 +144,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array(Array((sanityChecker, 1))) + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array.empty[Layer], + during = Array(Array((sanityChecker, 1))), + after = Array.empty[Layer] + ) ) } @@ -130,9 +160,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 1))), - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 1))), + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -142,9 +175,12 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(checkedFeatures)), - modelSelector = None, - nonCVTSDAG = Array.empty[Layer], - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = None, + before = Array.empty[Layer], + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -176,13 +212,16 @@ class OpWorkflowCoreTest 
extends FlatSpec with TestSparkContext { val ldaFeatures = lda.setInput(features).getOutput() val checkedFeatures = sanityChecker.setInput(label2, ldaFeatures).getOutput() val (pred, _, _) = ms.setInput(label, features).getOutput() - val (predLogReg, _, _) = logReg.setInput(label2, checkedFeatures).getOutput() + val predLogReg = logReg.setInput(label2, checkedFeatures).getOutput() assert( res = cutDAG(wf.setResultFeatures(pred, predLogReg)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 2)), Array((sanityChecker, 1)), Array((logReg.stage1, 0))), - cVTSDAG = Array.empty[Layer] + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 2)), Array((sanityChecker, 1)), Array((logReg, 0))), + during = Array.empty[Layer], + after = Array.empty[Layer] + ) ) } @@ -196,34 +235,27 @@ class OpWorkflowCoreTest extends FlatSpec with TestSparkContext { assert( res = cutDAG(wf.setResultFeatures(pred)), - modelSelector = Option(ms.stage1), - nonCVTSDAG = Array(Array((lda, 2), (zNormalize, 2))), - cVTSDAG = Array(Array((sanityChecker, 1))) + expected = CutDAG( + modelSelector = Option((ms.stage1, 0)), + before = Array(Array((lda, 2), (zNormalize, 2))), + during = Array(Array((sanityChecker, 1))), + after = Array.empty[Layer] + ) ) } - /** - * Shortcut function to cut DAG - * - * @param wf Workflow - * @return Cut DAG - */ - private def cutDAG(wf: OpWorkflow): (Option[MS], StagesDAG, StagesDAG) = { - wf.cutDAG(DAG.compute(wf.getResultFeatures())) - } - /** * Compare Actual and expected cut DAGs * - * @param res Actual results - * @param modelSelector Expected Model Selector - * @param nonCVTSDAG Expected nonCVTS DAG - * @param cVTSDAG Expected cVTS DAG + * @param res actual cut + * @param expected expected cut */ - private def assert(res: (Option[MS], StagesDAG, StagesDAG), - modelSelector: Option[MS], nonCVTSDAG: StagesDAG, cVTSDAG: StagesDAG): Unit = { - res._1 shouldBe modelSelector - res._2 shouldBe nonCVTSDAG - res._3 shouldBe cVTSDAG + private def assert(res: CutDAG, expected: CutDAG): Unit = { + res.modelSelector shouldBe expected.modelSelector + res.before should contain theSameElementsInOrderAs expected.before + res.during should contain theSameElementsInOrderAs expected.during + res.after should contain theSameElementsInOrderAs expected.after } } + + diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala index c3f60759b8..132be1ee0a 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowModelReaderWriterTest.scala @@ -51,7 +51,8 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtureTest with BeforeAndAfterEach { +class OpWorkflowModelReaderWriterTest + extends FlatSpec with UIDReset with PassengerSparkFixtureTest with BeforeAndAfterEach { implicit val jsonFormats: Formats = DefaultFormats val log = LoggerFactory.getLogger(this.getClass) @@ -71,7 +72,6 @@ class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtur saveModelPath = tempDir + "/op-rw-wf-model-test-" + DateTime.now().getMillis } - override def afterAll: Unit = { super.afterAll deleteRecursively(new File(saveFlowPathStable)) @@ -237,8 +237,7 @@ class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtur compareWorkflowModels(model, wfMR) } - trait 
VectorizedFlow { - UID.reset() + trait VectorizedFlow extends UIDReset { val cat = Seq(gender, boarded, height, age, description).transmogrify() val catHead = cat.map[Real](v => Real(v.value.toArray.headOption)) val wf = new OpWorkflow() @@ -324,3 +323,7 @@ class OpWorkflowModelReaderWriterTest extends FlatSpec with PassengerSparkFixtur p1.customParams shouldBe p2.customParams } } + +trait UIDReset { + UID.reset() +} diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala index f735b2ec80..2397316236 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowRunnerTest.scala @@ -37,7 +37,8 @@ import com.salesforce.op.OpWorkflowRunType._ import com.salesforce.op.evaluators.{BinaryClassificationMetrics, Evaluators} import com.salesforce.op.features.types._ import com.salesforce.op.readers.DataFrameFieldNames._ -import com.salesforce.op.stages.impl.classification.OpLogisticRegression +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.LogisticRegression +import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} import com.salesforce.op.test.{PassengerSparkFixtureTest, TestSparkStreamingContext} import com.salesforce.op.utils.spark.AppMetrics import com.salesforce.op.utils.spark.RichDataset._ @@ -68,10 +69,15 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec private val features = Seq(height, weight, gender, description, age).transmogrify() private val survivedNum = survived.occurs() - val (pred, raw, prob) = new OpLogisticRegression().setInput(survivedNum, features).getOutput() + // TODO put back LR when evaluators work with prediction features + val (pred, raw, prob) = BinaryClassificationModelSelector.withTrainValidationSplit(None) + .setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0) + .setInput(survivedNum, features).getOutput() private val workflow = new OpWorkflow().setResultFeatures(pred, raw, survivedNum).setReader(dataReader) private val evaluator = Evaluators.BinaryClassification().setLabelCol(survivedNum).setPredictionCol(pred).setRawPredictionCol(raw) + .setProbabilityCol(prob) val metricsPromise = Promise[AppMetrics]() @@ -138,7 +144,7 @@ class OpWorkflowRunnerTest extends AsyncFlatSpec metricsLocation = Some(modelMetricsLocation.toString) ) val res = doRun[TrainResult](runConfig, modelLocation, modelMetricsLocation) - res.modelSummary shouldBe "{ }" + res.modelSummary.nonEmpty shouldBe true } it should "score a dataset with a trained model" in { diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala index f889f4c35c..9c5708ea15 100644 --- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala +++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala @@ -31,26 +31,27 @@ package com.salesforce.op -import com.salesforce.op.evaluators.{BinaryClassificationMetrics, Evaluators} -import com.salesforce.op.features.OPFeature +import com.salesforce.op.evaluators._ +import com.salesforce.op.features._ import com.salesforce.op.features.types._ import com.salesforce.op.filters.RawFeatureFilter import com.salesforce.op.readers.DataFrameFieldNames._ import com.salesforce.op.readers._ import com.salesforce.op.stages.base.unary._ import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry._ -import 
com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, Stage1BinaryClassificationModelSelector} +import com.salesforce.op.stages.impl.classification._ import com.salesforce.op.stages.impl.preparators.SanityChecker -import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.DataBalancer -import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest, TestFeatureBuilder} +import com.salesforce.op.stages.impl.regression.{LossType, RegressionModelSelector, RegressionModelsToTry} +import com.salesforce.op.stages.impl.selector.{ModelSelectorBaseNames, SelectedModel} +import com.salesforce.op.stages.impl.tuning._ +import com.salesforce.op.test.{Passenger, PassengerCSV, PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{DoubleType, StringType} import org.apache.spark.sql.{Dataset, SparkSession} -import org.joda.time.DateTime +import org.joda.time.{DateTime, Duration} import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @@ -158,7 +159,11 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { .setInput(survivedNum, checked).getOutput() val wf = new OpWorkflow() .setResultFeatures(whyNotNormed, prob) - .withRawFeatureFilter(Option(dataReader), None, minFillRate = 0.7) + .withRawFeatureFilter( + trainingReader = Option(dataReader), + scoringReader = None, + minFillRate = 0.7, + protectedFeatures = Array(height, weight)) val wfM = wf.train() val data = wfM.score() @@ -183,7 +188,8 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { it should "be able to compute a partial dataset in both workflow and workflow model" in { val fields = - List(KeyFieldName, height.name, weight.name, heightNormed.name, density.name, densityByHeightNormed.name) + List(KeyFieldName, height.name, weight.name, heightNormed.name, density.name, + densityByHeightNormed.name, whyNotNormed.name) val data = workflow.setReader(dataReader).computeDataUpTo(whyNotNormed) data.schema.fieldNames should contain theSameElementsAs fields @@ -439,6 +445,7 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { .setRawPredictionCol(rawPred) .setLabelCol(survivedNum) .setPredictionCol(pred) + .setProbabilityCol(prob) val scores1 = fittedWorkflow.score(keepIntermediateFeatures = true) val (scores2, metrics) = fittedWorkflow.scoreAndEvaluate(evaluator = evaluator, keepIntermediateFeatures = true) @@ -452,7 +459,12 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest { scores1.schema.fields.map(_.metadata.toString()) should contain theSameElementsAs scores2.schema.fields.map(_.metadata.toString()) - metrics shouldBe BinaryClassificationMetrics(1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 5.0, 0.0, 0.0) + val probs = scores2.collect(prob) + val thresholds = probs.map(_.value(1)).distinct.sorted.reverse + + metrics shouldBe BinaryClassificationMetrics(1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 5.0, 0.0, 0.0, + thresholds.toSeq, Seq(1.0, 0.5, 0.25, 0.2, 1.0/6), Seq(1.0, 1.0, 1.0, 1.0, 1.0), + Seq(0.0, 0.2, 0.6, 0.8, 1.0)) } it should "return an empty data set if passed empty data for scoring" in { diff --git a/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala 
b/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala index 9689ff3c9a..909f254fa5 100644 --- a/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala +++ b/core/src/test/scala/com/salesforce/op/evaluators/EvaluatorsTest.scala @@ -32,7 +32,8 @@ package com.salesforce.op.evaluators import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.OpLogisticRegression +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.LogisticRegression +import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors @@ -80,7 +81,11 @@ class EvaluatorsTest extends FlatSpec with TestSparkContext { ) val test_label = test_rawLabel.copy(isResponse = true) - val testEstimator = new OpLogisticRegression().setInput(label, features) + // TODO put back LR when evaluators work with prediction features + val testEstimator = BinaryClassificationModelSelector() + .setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0) + .setInput(label, features) val (pred, rawPred, prob) = testEstimator.getOutput() val model = testEstimator.fit(ds) val transformedData = model.setInput(test_label, test_features).transform(test_ds) diff --git a/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala b/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala index 889041569a..a89009b8dd 100644 --- a/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/evaluators/OpBinaryClassificationEvaluatorTest.scala @@ -32,9 +32,9 @@ package com.salesforce.op.evaluators import com.salesforce.op.evaluators.BinaryClassEvalMetrics._ -import com.salesforce.op.evaluators.MultiClassEvalMetrics._ import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.OpLogisticRegression +import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.LogisticRegression +import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.evaluation._ import org.apache.spark.ml.linalg.Vectors @@ -102,11 +102,16 @@ class OpBinaryClassificationEvaluatorTest extends FlatSpec with TestSparkContext ) val one_label = one_rawLabel.copy(isResponse = true) - val testEstimator = new OpLogisticRegression().setInput(label, features) - val (pred, rawPred, _) = testEstimator.getOutput() + // TODO put back LR when evaluators work with prediction features + val testEstimator = BinaryClassificationModelSelector() + .setModelsToTry(LogisticRegression) + .setLogisticRegressionRegParam(0) + .setInput(label, features) + val (pred, rawPred, prob) = testEstimator.getOutput() val testEvaluator = new OpBinaryClassificationEvaluator().setLabelCol(label) .setPredictionCol(pred) .setRawPredictionCol(rawPred) + .setProbabilityCol(prob) val model = testEstimator.fit(ds) val sparkBinaryEvaluator = new BinaryClassificationEvaluator() val sparkMulticlassEvaluator = new MulticlassClassificationEvaluator() @@ -122,8 +127,9 @@ class OpBinaryClassificationEvaluatorTest extends FlatSpec with 
TestSparkContext sparkBinaryEvaluator.setLabelCol(label.name).setRawPredictionCol(rawPred.name) sparkMulticlassEvaluator.setLabelCol(label.name).setPredictionCol(pred.name) - metrics.AuROC shouldBe sparkBinaryEvaluator.setMetricName(AuROC.sparkEntryName).evaluate(transformedData) - metrics.AuPR shouldBe sparkBinaryEvaluator.setMetricName(AuPR.sparkEntryName).evaluate(transformedData) + // TODO: These are no longer the same since we now use probabilities as thresholds, and Spark uses rawPredictions + // metrics.AuROC shouldBe sparkBinaryEvaluator.setMetricName(AuROC.sparkEntryName).evaluate(transformedData) + // metrics.AuPR shouldBe sparkBinaryEvaluator.setMetricName(AuPR.sparkEntryName).evaluate(transformedData) val (tp, tn, fp, fn, precision, recall, f1) = getPosNegValues( transformedData.select(pred.name, test_label.name).rdd diff --git a/core/src/test/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluatorTest.scala b/core/src/test/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluatorTest.scala new file mode 100644 index 0000000000..beea884623 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/evaluators/OpMultiClassificationEvaluatorTest.scala @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.evaluators + +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpMultiClassificationEvaluatorTest extends FlatSpec with TestSparkContext { + + // loggingLevel(Level.INFO) + + val numRows = 1000L + val (dsMulti, labelRawMulti, predMulti, rawPredMulti, probMulti) = + TestFeatureBuilder[RealNN, RealNN, OPVector, OPVector](Seq.fill(numRows.toInt)( + (1.0, 0.0, Vectors.dense(10.0, 5.0, 1.0, 0.0, 0.0), Vectors.dense(0.70, 0.25, 0.05, 0.0, 0.0)) + ).map(v => (v._1.toRealNN, v._2.toRealNN, v._3.toOPVector, v._4.toOPVector)) + ) + val labelMulti = labelRawMulti.copy(isResponse = true) + val defaultThresholds = (0 to 100).map(_ / 100.0).toArray + val defaultTopNs = Array(1, 3) + + Spec[OpMultiClassificationEvaluator] should "determine incorrect/correct counts from the thresholds" in { + val evaluatorMulti = new OpMultiClassificationEvaluator() + .setLabelCol(labelMulti) + .setPredictionCol(predMulti) + .setRawPredictionCol(rawPredMulti) + .setProbabilityCol(probMulti) + + val metricsMulti = evaluatorMulti.evaluateAll(dsMulti) + + // Predictions should never be correct for top1 (since correct class has 2nd highest probability). + // For top3, it should be correct up to a threshold of 0.25 + val expectedCorrects = Map( + 1 -> Seq.fill(defaultThresholds.length)(0L), + 3 -> (Seq.fill(26)(numRows) ++ Seq.fill(defaultThresholds.length - 26)(0L)) + ) + // For top1, prediction is incorrect up to a threshold of 0.7, and then no prediction + // For top3, prediction is incorrect in the threshold range (0.25, 0.7], then no prediction + val expectedIncorrects = Map( + 1 -> (Seq.fill(71)(numRows) ++ Seq.fill(defaultThresholds.length - 71)(0L)), + 3 -> (Seq.fill(26)(0L) ++ Seq.fill(71 - 26)(numRows) ++ Seq.fill(defaultThresholds.length - 71)(0L)) + ) + val expectedNoPredictions = Map( + 1 -> (Seq.fill(71)(0L) ++ Seq.fill(defaultThresholds.length - 71)(numRows)), + 3 -> (Seq.fill(26)(0L) ++ Seq.fill(71 - 26)(0L) ++ Seq.fill(defaultThresholds.length - 71)(numRows)) + ) + + metricsMulti.ThresholdMetrics shouldEqual ThresholdMetrics( + topNs = defaultTopNs, + thresholds = defaultThresholds, + correctCounts = expectedCorrects, + incorrectCounts = expectedIncorrects, + noPredictionCounts = expectedNoPredictions + ) + } + + it should "have settable thresholds and topNs" in { + val thresholds = Array(0.1, 0.2, 0.5, 0.8, 0.9, 1.0) + val topNs = Array(1, 4, 12) + + val evaluatorMulti = new OpMultiClassificationEvaluator() + .setLabelCol(labelMulti) + .setPredictionCol(predMulti) + .setRawPredictionCol(rawPredMulti) + .setProbabilityCol(probMulti) + .setThresholds(thresholds) + .setTopNs(topNs) + + val metricsMulti = evaluatorMulti.evaluateAll(dsMulti) + + // Predictions should never be correct for top1 (since correct class has 2nd highest probability).
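+ // A sketch (ours, not the evaluator's implementation) of the rule the expected
+ // counts in these tests encode: for a (topN, threshold) pair, keep the topN
+ // classes by probability that sit at or above the threshold; a row counts as
+ // correct if the true class is kept, incorrect if only other classes are kept,
+ // and a no-prediction if nothing clears the threshold:
+ //   def outcome(probs: Seq[Double], label: Int, topN: Int, t: Double): String = {
+ //     val kept = probs.zipWithIndex.sortBy(-_._1).take(topN).collect { case (p, i) if p >= t => i }
+ //     if (kept.isEmpty) "noPrediction" else if (kept.contains(label)) "correct" else "incorrect"
+ //   }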
+ // For top4 & top12, it should be correct up to a threshold of 0.25 + val expectedCorrects = Map( + 1 -> Seq(0L, 0L, 0L, 0L, 0L, 0L), + 4 -> Seq(numRows, numRows, 0L, 0L, 0L, 0L), + 12 -> Seq(numRows, numRows, 0L, 0L, 0L, 0L) + ) + // For top1, prediction is incorrect up to a threshold of 0.7, and then no prediction + // For top4 & top12, prediction is incorrect in the threshold range (0.25, 0.7], then no prediction + val expectedIncorrects = Map( + 1 -> Seq(numRows, numRows, numRows, 0L, 0L, 0L), + 4 -> Seq(0L, 0L, numRows, 0L, 0L, 0L), + 12 -> Seq(0L, 0L, numRows, 0L, 0L, 0L) + ) + val expectedNoPredictions = Map( + 1 -> Seq(0L, 0L, 0L, numRows, numRows, numRows), + 4 -> Seq(0L, 0L, 0L, numRows, numRows, numRows), + 12 -> Seq(0L, 0L, 0L, numRows, numRows, numRows) + ) + + metricsMulti.ThresholdMetrics shouldEqual ThresholdMetrics( + topNs = topNs, + thresholds = thresholds, + correctCounts = expectedCorrects, + incorrectCounts = expectedIncorrects, + noPredictionCounts = expectedNoPredictions + ) + } + + it should "not allow topNs to be negative or 0" in { + intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setTopNs(Array(0, 1, 3))) + intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setTopNs(Array(1, -4, 3))) + } + + it should "not allow thresholds to be out of the range [0.0, 1.0]" in { + intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setThresholds(Array(-0.1, 0.4))) + intercept[java.lang.IllegalArgumentException](new OpMultiClassificationEvaluator().setThresholds(Array(1.1, 0.4))) + } + +} diff --git a/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala b/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala index 16cc8978b2..c75498f56f 100644 --- a/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/evaluators/OpRegressionEvaluatorTest.scala @@ -32,7 +32,8 @@ package com.salesforce.op.evaluators import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.regression.OpLinearRegression +import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.LinearRegression +import com.salesforce.op.stages.impl.regression.{OpLinearRegression, RegressionModelSelector} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamMap @@ -54,7 +55,11 @@ class OpRegressionEvaluatorTest extends FlatSpec with TestSparkContext { ) val label = rawLabel.copy(isResponse = true) - val testEstimator = new OpLinearRegression().setInput(label, features) + // TODO put back LR when evaluators work with prediction features + val testEstimator = RegressionModelSelector.withTrainValidationSplit(dataSplitter = None, trainRatio = 0.5) + .setModelsToTry(LinearRegression) + .setLinearRegressionRegParam(0) + .setInput(label, features) val prediction = testEstimator.getOutput() val testEvaluator = new OpRegressionEvaluator().setLabelCol(label).setPredictionCol(prediction) diff --git a/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala b/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala index de7fc287cb..f58735d317 100644 --- a/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala +++ b/core/src/test/scala/com/salesforce/op/features/TransientFeatureTest.scala @@ -76,6 +76,20 @@ class TransientFeatureTest extends
FlatSpec with PassengerFeaturesTest with Test assertThrows[RuntimeException] { t.getFeature() } } + it should "be equal to self" in { + tf shouldBe tf + tf.equals(tf) shouldBe true + } + + it should "not be equal to a different instance" in { + val other = TransientFeature(weight) + tf should not be other + tf.equals(other) shouldBe false + } + + it should "have hash code of its uid" in { + tf.hashCode() shouldBe tf.uid.hashCode + } it should "cast back to FeatureLike" in { tf.asFeatureLike[Real] shouldBe height diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala new file mode 100644 index 0000000000..9a4f77c1b6 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ */ + +package com.salesforce.op.filters + +import com.salesforce.op.OpParams +import com.salesforce.op.features.{OPFeature, TransientFeature} +import com.salesforce.op.stages.impl.feature.HashAlgorithm +import com.salesforce.op.test.PassengerSparkFixtureTest +import com.salesforce.op.utils.spark.RichDataset._ +import org.apache.spark.mllib.feature.HashingTF +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { + + Spec[FeatureDistribution] should "be correctly created for features" in { + val features = Array(survived, age, gender, height, weight).map(TransientFeature.apply) + val values: Array[(Boolean, ProcessedSeq)] = Array( + (false, Right(Seq(1.0))), (true, Right(Seq.empty[Double])), (false, Left(Seq("male", "female"))), + (true, Left(Seq.empty[String])), (false, Right(Seq(1.0, 3.0, 5.0))) + ) + val summary = + Array(Summary(0.0, 1.0), Summary(-1.6, 10.6), Summary(0.0, 3.0), Summary(0.0, 0.0), Summary(1.0, 5.0)) + val bins = 10 + val hasher: HashingTF = new HashingTF(numFeatures = bins) + .setBinary(false) + .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) + + val featureKeys: Array[FeatureKey] = features.map(f => (f.name, None)) + val processedSeqs: Array[Option[ProcessedSeq]] = values.map { case (isEmpty, processed) => + if (isEmpty) None else Option(processed) + } + val distribs = featureKeys.zip(summary).zip(processedSeqs).map { case ((key, summ), seq) => + FeatureDistribution(key, summ, seq, bins, hasher) + } + distribs.foreach{ d => + d.key shouldBe None + d.count shouldBe 1 + d.distribution.length shouldBe bins + } + distribs(0).nulls shouldBe 0 + distribs(1).nulls shouldBe 1 + distribs(1).distribution.sum shouldBe 0 + distribs(2).distribution.sum shouldBe 2 + distribs(2).summaryInfo should contain theSameElementsAs Array(0.0, 3.0) + distribs(3).distribution.sum shouldBe 0 + distribs(4).distribution.sum shouldBe 3 + distribs(4).summaryInfo.length shouldBe bins + } + + it should "be correctly created for map features" in { + val features = Array(stringMap, numericMap, booleanMap).map(TransientFeature.apply) + val values: Array[Map[String, ProcessedSeq]] = Array( + Map("A" -> Left(Seq("male", "female"))), + Map("A" -> Right(Seq(1.0)), "B" -> Right(Seq(1.0))), + Map("B" -> Right(Seq(0.0)))) + val summary = Array( + Map("A" -> Summary(0.0, 1.0), "B" -> Summary(0.0, 5.0)), + Map("A" -> Summary(-1.6, 10.6), "B" -> Summary(0.0, 3.0)), + Map("B" -> Summary(0.0, 0.0))) + val bins = 10 + val hasher: HashingTF = new HashingTF(numFeatures = bins) + .setBinary(false) + .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) + val distribs = features.map(_.name).zip(summary).zip(values).flatMap { case ((name, summaryMaps), valueMaps) => + summaryMaps.map { case (key, summary) => + val featureKey = (name, Option(key)) + FeatureDistribution(featureKey, summary, valueMaps.get(key), bins, hasher) + } + } + + distribs.length shouldBe 5 + distribs.foreach{ d => + d.key.contains("A") || d.key.contains("B") shouldBe true + d.count shouldBe 1 + if (d.name != "booleanMap") d.distribution.length shouldBe bins + else d.distribution.length shouldBe 2 + } + distribs(0).nulls shouldBe 0 + distribs(0).summaryInfo should contain theSameElementsAs Array(0.0, 1.0) + distribs(1).nulls shouldBe 1 + distribs(0).distribution.sum shouldBe 2 + distribs(1).distribution.sum shouldBe 0 + 
distribs(2).summaryInfo.length shouldBe bins + distribs(2).distribution.sum shouldBe 1 + distribs(4).distribution(0) shouldBe 1 + distribs(4).distribution(1) shouldBe 0 + distribs(4).summaryInfo.length shouldBe 2 + } + + it should "correctly compare fill rates" in { + val fd1 = FeatureDistribution("A", None, 10, 1, Array.empty, Array.empty) + val fd2 = FeatureDistribution("A", None, 20, 20, Array.empty, Array.empty) + fd1.relativeFillRate(fd2) shouldBe 0.9 + } + + it should "correctly compare relative fill rates" in { + val fd1 = FeatureDistribution("A", None, 10, 1, Array.empty, Array.empty) + val fd2 = FeatureDistribution("A", None, 20, 19, Array.empty, Array.empty) + trainSummaries(0).relativeFillRatio(scoreSummaries(0)) shouldBe 4.5 + trainSummaries(2).relativeFillRatio(scoreSummaries(2)) shouldBe 1.0 + fd1.relativeFillRatio(fd2) shouldBe 18.0 + } + + it should "correctly compute the JS divergence" in { + val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty) + val fd2 = FeatureDistribution("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty) + fd1.jsDivergence(fd2) should be < eps + + val fd3 = FeatureDistribution("A", None, 10, 1, Array(0, 0, 1000, 1000, 0), Array.empty) + fd3.jsDivergence(fd3) should be < eps + val fd4 = FeatureDistribution("A", None, 20, 20, Array(200, 800, 0, 0, 1200), Array.empty) + (fd3.jsDivergence(fd4) - 1.0) should be < eps + } +} diff --git a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala new file mode 100644 index 0000000000..ee4c835ae0 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ */ + +package com.salesforce.op.filters + +trait FiltersTestData { + + protected val eps = 1E-2 + + protected val trainSummaries = Seq( + FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("C", Some("1"), 10, 1, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("D", Some("1"), 10, 9, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("D", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty) + ) + + protected val scoreSummaries = Seq( + FeatureDistribution("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty), + FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0), Array.empty), + FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), + FeatureDistribution("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty), + FeatureDistribution("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty) + ) +} diff --git a/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala b/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala new file mode 100644 index 0000000000..0f80a75302 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/filters/PreparedFeaturesTest.scala @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.filters + +import scala.math.round + +import com.salesforce.op.stages.impl.preparators.CorrelationType +import com.salesforce.op.test.TestSparkContext +import com.twitter.algebird.Monoid._ +import com.twitter.algebird.Operators._ +import org.apache.spark.mllib.linalg.{Matrix, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class PreparedFeaturesTest extends FlatSpec with TestSparkContext { + + val responseKey1: FeatureKey = "Response1" -> None + val responseKey2: FeatureKey = "Response2" -> None + val predictorKey1: FeatureKey = "Predictor1" -> None + val predictorKey2A: FeatureKey = "Predictor2" -> Option("A") + val predictorKey2B: FeatureKey = "Predictor2" -> Option("B") + + val preparedFeatures1 = PreparedFeatures( + responses = Map(responseKey1 -> Right(Seq(1.0)), responseKey2 -> Right(Seq(0.5))), + predictors = Map( + predictorKey1 -> Right(Seq(0.0, 0.0)), + predictorKey2A -> Left(Seq("i", "ii")), + predictorKey2B -> Left(Seq("iii")))) + val preparedFeatures2 = PreparedFeatures( + responses = Map(responseKey1 -> Right(Seq(0.0))), + predictors = Map(predictorKey1 -> Right(Seq(0.4, 0.5)))) + val preparedFeatures3 = PreparedFeatures( + responses = Map(responseKey2 -> Right(Seq(-0.5))), + predictors = Map(predictorKey2A -> Left(Seq("iv")))) + val allPreparedFeatures = Seq(preparedFeatures1, preparedFeatures2, preparedFeatures3) + val (allResponseSummaries, allPredictorSummaries) = allPreparedFeatures.map(_.summaries).reduce(_ + _) + + val allResponseKeys1 = Array(responseKey1, responseKey2) + val allResponseKeys2 = Array(responseKey1) + val allPredictorKeys1 = Array(predictorKey1, predictorKey2A, predictorKey2B) + val allPredictorKeys2 = Array(predictorKey1) + + Spec[PreparedFeatures] should "produce correct summaries" in { + val (responseSummaries1, predictorSummaries1) = preparedFeatures1.summaries + val (responseSummaries2, predictorSummaries2) = preparedFeatures2.summaries + val (responseSummaries3, predictorSummaries3) = preparedFeatures3.summaries + + responseSummaries1 should contain theSameElementsAs + Seq(responseKey1 -> Summary(1.0, 1.0), responseKey2 -> Summary(0.5, 0.5)) + predictorSummaries1 should contain theSameElementsAs + Seq(predictorKey1 -> Summary(0.0, 0.0), predictorKey2A -> Summary(2.0, 2.0), predictorKey2B -> Summary(1.0, 1.0)) + responseSummaries2 should contain theSameElementsAs + Seq(responseKey1 -> Summary(0.0, 0.0)) + predictorSummaries2 should contain theSameElementsAs + Seq(predictorKey1 -> Summary(0.4, 0.5)) + responseSummaries3 should contain theSameElementsAs + Seq(responseKey2 -> Summary(-0.5, -0.5)) + predictorSummaries3 should contain theSameElementsAs + Seq(predictorKey2A -> Summary(1.0, 1.0)) + allResponseSummaries should contain theSameElementsAs + Seq(responseKey1 -> Summary(0.0, 1.0), responseKey2 -> Summary(-0.5, 0.5)) + allPredictorSummaries should contain theSameElementsAs + Seq(predictorKey1 -> Summary(0.0, 0.5), predictorKey2A -> Summary(1.0, 2.0), predictorKey2B -> Summary(1.0, 1.0)) + } + + it should "produce correct null-label leakage vector with single response" in { + preparedFeatures1.getNullLabelLeakageVector(allResponseKeys2, allPredictorKeys1).toArray shouldEqual + Array(1.0, 0.0, 0.0, 0.0) + + preparedFeatures2.getNullLabelLeakageVector(allResponseKeys2, allPredictorKeys1).toArray shouldEqual + Array(0.0, 0.0, 1.0, 1.0) + + 
preparedFeatures3.getNullLabelLeakageVector(allResponseKeys2, allPredictorKeys1).toArray shouldEqual + Array(0.0, 1.0, 0.0, 1.0) + } + + it should "produce correct null-label leakage vector with multiple responses" in { + preparedFeatures1.getNullLabelLeakageVector(allResponseKeys1, allPredictorKeys1).toArray shouldEqual + Array(1.0, 0.5, 0.0, 0.0, 0.0) + + preparedFeatures2.getNullLabelLeakageVector(allResponseKeys1, allPredictorKeys1).toArray shouldEqual + Array(0.0, 0.0, 0.0, 1.0, 1.0) + + preparedFeatures3.getNullLabelLeakageVector(allResponseKeys1, allPredictorKeys1).toArray shouldEqual + Array(0.0, -0.5, 1.0, 0.0, 1.0) + } + + it should "produce correct null-label leakage Pearson correlation matrix with multiple responses" in { + val expected = Seq( + Array(1.0, 0.87, -0.5, -0.5, -1.0), + Array(1.0, -0.87, 0.0, -0.87), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys1, CorrelationType.Pearson, expected) + } + + it should "produce correct null-label leakage Spearman correlation matrix with multiple responses" in { + val expected = Seq( + Array(1.0, 0.87, -0.5, -0.5, -1.0), + Array(1.0, -0.87, 0.0, -0.87), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys1, CorrelationType.Spearman, expected) + } + + it should "produce correct null-label leakage Pearson correlation matrix with single response" in { + val expected = Seq( + Array(1.0, -0.5, -0.5, -1.0), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys2, CorrelationType.Pearson, expected) + } + + it should "produce correct null-label leakage Spearman correlation matrix with single response" in { + val expected = Seq( + Array(1.0, -0.5, -0.5, -1.0), + Array(1.0, -0.5, 0.5), + Array(1.0, 0.5), + Array(1.0)) + testCorrMatrix(allResponseKeys2, CorrelationType.Spearman, expected) + } + + def testCorrMatrix( + responseKeys: Array[FeatureKey], + correlationType: CorrelationType, + expectedResult: Seq[Array[Double]] + ): Unit = { + val corrRDD = + sc.parallelize(allPreparedFeatures.map(_.getNullLabelLeakageVector(responseKeys, allPredictorKeys1))) + val corrMatrix = Statistics.corr(corrRDD, correlationType.sparkName) + + corrMatrix.colIter.zipWithIndex.map { case(vec, idx) => + // It's symmetric, so can drop based on index + vec.toArray.drop(idx).map(BigDecimal(_).setScale(2, BigDecimal.RoundingMode.HALF_UP).toDouble) + }.toSeq should contain theSameElementsInOrderAs expectedResult + } +} diff --git a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala index add784ec66..6930995887 100644 --- a/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/RawFeatureFilterTest.scala @@ -33,164 +33,63 @@ package com.salesforce.op.filters import com.salesforce.op.OpParams import com.salesforce.op.features.{OPFeature, TransientFeature} +import com.salesforce.op.readers.DataFrameFieldNames import com.salesforce.op.stages.impl.feature.HashAlgorithm -import com.salesforce.op.test.PassengerSparkFixtureTest +import com.salesforce.op.test.{Passenger, PassengerSparkFixtureTest} import com.salesforce.op.utils.spark.RichDataset._ +import com.twitter.algebird.Operators._ import org.apache.spark.mllib.feature.HashingTF +import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner 
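The PreparedFeaturesTest expectations above fix the layout of the null-label leakage vector that the RawFeatureFilter tests below rely on: response values come first (0.0 when a response is absent from the row), followed by a 1.0/0.0 null indicator per predictor key. A minimal sketch of that layout, with simplified types and a helper name of our own:

def nullLabelVector(
  responseValues: Map[FeatureKey, Double],
  presentPredictors: Set[FeatureKey],
  responseKeys: Array[FeatureKey],
  predictorKeys: Array[FeatureKey]
): Array[Double] =
  // response values first, then a null indicator per predictor
  responseKeys.map(k => responseValues.getOrElse(k, 0.0)) ++
    predictorKeys.map(k => if (presentPredictors.contains(k)) 0.0 else 1.0)

Rows built this way are what Statistics.corr consumes in testCorrMatrix; RawFeatureFilter drops any predictor whose null indicator correlates with a response more strongly than maxCorrelation.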
@RunWith(classOf[JUnitRunner]) -class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { - - private val eps = 1E-2 - - private val trainSummaries = Seq( - FeatureDistrib("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("C", Some("1"), 10, 1, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("D", Some("1"), 10, 9, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("D", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty) - ) - - private val scoreSummaries = Seq( - FeatureDistrib("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty), - FeatureDistrib("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0), Array.empty), - FeatureDistrib("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty), - FeatureDistrib("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty), - FeatureDistrib("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty) - ) - - Spec[Summary] should "be correctly created from a sequence of features" in { - val f1 = Left(Seq("a", "b", "c")) - val f2 = Right(Seq(0.5, 1.0)) - val f1s = Summary(f1) - val f2s = Summary(f2) - f1s.min shouldBe 3 - f1s.max shouldBe 3 - f2s.min shouldBe 0.5 - f2s.max shouldBe 1.0 - } - - Spec[FeatureDistrib] should "be correctly created for features" in { - val features = Array(survived, age, gender, height, weight).map(TransientFeature.apply) - val values: Array[(Boolean, FeatureDistrib.ProcessedSeq)] = Array( - (false, Right(Seq(1.0))), (true, Right(Seq.empty[Double])), (false, Left(Seq("male", "female"))), - (true, Left(Seq.empty[String])), (false, Right(Seq(1.0, 3.0, 5.0))) - ) - val summary = Array(Summary(0.0, 1.0), Summary(-1.6, 10.6), Summary(0.0, 3.0), Summary(0.0, 0.0), Summary(1.0, 5.0)) - val bins = 10 - val hasher: HashingTF = new HashingTF(numFeatures = bins) - .setBinary(false) - .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) - - val distribs = FeatureDistrib.getDistributions(features, values, summary, bins, hasher) - distribs.foreach{ d => - d.key shouldBe None - d.count shouldBe 1 - d.distribution.length shouldBe bins - } - distribs(0).nulls shouldBe 0 - distribs(1).nulls shouldBe 1 - distribs(1).distribution.sum shouldBe 0 - distribs(2).distribution.sum shouldBe 2 - distribs(2).summaryInfo should contain theSameElementsAs Array(0.0, 3.0) - distribs(3).distribution.sum shouldBe 0 - distribs(4).distribution.sum shouldBe 3 - distribs(4).summaryInfo.length shouldBe bins - } - - it should "be correctly created for map features" in { - val features = Array(stringMap, numericMap, booleanMap).map(TransientFeature.apply) - val values: Array[Map[String, FeatureDistrib.ProcessedSeq]] = Array( - Map("A" -> Left(Seq("male", "female"))), - Map("A" -> Right(Seq(1.0)), "B" -> Right(Seq(1.0))), - Map("B" -> Right(Seq(0.0)))) - val summary = Array( - Map("A" -> Summary(0.0, 1.0), "B" -> Summary(0.0, 5.0)), - Map("A" -> Summary(-1.6, 10.6), "B" -> Summary(0.0, 3.0)), - Map("B" -> Summary(0.0, 0.0))) - val bins = 10 - val hasher: HashingTF = new HashingTF(numFeatures = bins) - .setBinary(false) - .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase) - - val distribs = FeatureDistrib.getMapDistributions(features, values, summary, bins, hasher) - distribs.length shouldBe 5 - distribs.foreach{ d => - d.key.contains("A") || d.key.contains("B") shouldBe true - d.count 
shouldBe 1 - if (d.name != "booleanMap") d.distribution.length shouldBe bins - else d.distribution.length shouldBe 2 - } - distribs(0).nulls shouldBe 0 - distribs(0).summaryInfo should contain theSameElementsAs Array(0.0, 1.0) - distribs(1).nulls shouldBe 1 - distribs(0).distribution.sum shouldBe 2 - distribs(1).distribution.sum shouldBe 0 - distribs(2).summaryInfo.length shouldBe bins - distribs(2).distribution.sum shouldBe 1 - distribs(4).distribution(0) shouldBe 1 - distribs(4).distribution(1) shouldBe 0 - distribs(4).summaryInfo.length shouldBe 2 - } - - it should "correctly compare fill rates" in { - val fd1 = FeatureDistrib("A", None, 10, 1, Array.empty, Array.empty) - val fd2 = FeatureDistrib("A", None, 20, 20, Array.empty, Array.empty) - fd1.relativeFillRate(fd2) shouldBe 0.9 - } - - it should "correctly compare relative fill rates" in { - val fd1 = FeatureDistrib("A", None, 10, 1, Array.empty, Array.empty) - val fd2 = FeatureDistrib("A", None, 20, 19, Array.empty, Array.empty) - trainSummaries(0).relativeFillRatio(scoreSummaries(0)) shouldBe 4.5 - trainSummaries(2).relativeFillRatio(scoreSummaries(2)) shouldBe 1.0 - fd1.relativeFillRatio(fd2) shouldBe 18.0 - } - - it should "correctly compute the DS divergence" in { - val fd1 = FeatureDistrib("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty) - val fd2 = FeatureDistrib("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty) - fd1.jsDivergence(fd2) should be < eps - - val fd3 = FeatureDistrib("A", None, 10, 1, Array(0, 0, 1000, 1000, 0), Array.empty) - fd3.jsDivergence(fd3) should be < eps - val fd4 = FeatureDistrib("A", None, 20, 20, Array(200, 800, 0, 0, 1200), Array.empty) - (fd3.jsDivergence(fd4) - 1.0) should be < eps - } - +class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with FiltersTestData { Spec[RawFeatureFilter[_]] should "compute feature stats correctly" in { val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.8, Double.PositiveInfinity, 0.7) - val summaries = filter.computeFeatureStats(passengersDataSet, features) + val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.8, Double.PositiveInfinity, 0.7, 1.0) + val allFeatureInfo = filter.computeFeatureStats(passengersDataSet, features) - summaries.featureSummaries.size shouldBe 7 - summaries.mapFeatureSummaries.size shouldBe 3 - summaries.featureDistributions.size shouldBe 13 + allFeatureInfo.responseSummaries.size shouldBe 1 + allFeatureInfo.responseSummaries.headOption.map(_._2) shouldEqual Option(Summary(0, 1)) + allFeatureInfo.responseDistributions.size shouldBe 1 + allFeatureInfo.predictorSummaries.size shouldBe 12 + allFeatureInfo.predictorDistributions.size shouldBe 12 - val surv = summaries.featureDistributions(0) + val surv = allFeatureInfo.responseDistributions(0) surv.name shouldBe survived.name surv.key shouldBe None surv.count shouldBe 6 surv.nulls shouldBe 4 surv.distribution.sum shouldBe 2 - val strMapF = summaries.featureDistributions(7) + + val ageF = allFeatureInfo.predictorDistributions.filter(_.name == age.name)(0) + ageF.name shouldBe age.name + ageF.key shouldBe None + ageF.count shouldBe 6 + ageF.nulls shouldBe 2 + ageF.distribution.sum shouldBe 4 + + val strMapF = + allFeatureInfo.predictorDistributions.filter(d => d.name == stringMap.name && d.key == Option("Female"))(0) + strMapF.name shouldBe stringMap.name if 
(strMapF.key.contains("Female")) strMapF.nulls shouldBe 3 else strMapF.nulls shouldBe 4 - val strMapM = summaries.featureDistributions(8) + + val strMapM = + allFeatureInfo.predictorDistributions.filter(d => d.name == stringMap.name && d.key == Option("Male"))(0) + strMapM.name shouldBe stringMap.name if (strMapM.key.contains("Male")) strMapM.nulls shouldBe 4 else strMapM.nulls shouldBe 3 } it should "correctly determine which features to exclude based on the stats of training fill rate" in { // only fill rate matters - val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0) - val (excludedTrainF, excludedTrainMK) = filter.getFeaturesToExclude(trainSummaries, Seq.empty) + val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val (excludedTrainF, excludedTrainMK) = + filter.getFeaturesToExclude(trainSummaries, Seq.empty, Map.empty) excludedTrainF.toSet shouldEqual Set("B", "D") excludedTrainMK.keySet shouldEqual Set("C") excludedTrainMK.head._2 shouldEqual Set("2") @@ -199,8 +98,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { it should "correctly determine which features to exclude based on the stats of training and scoring fill rate" in { // only fill rate matters - val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0) - val (excludedBothF, excludedBothMK) = filter.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.2, 1.0, Double.PositiveInfinity, 1.0, 1.0) + val (excludedBothF, excludedBothMK) = + filter.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothF.toSet shouldEqual Set("B", "D") excludedBothMK.keySet shouldEqual Set("C") excludedBothMK.head._2 shouldEqual Set("2") @@ -208,24 +108,27 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { it should "correctly determine which features to exclude based on the stats of relative fill rate" in { // relative fill rate matters - val filter2 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 0.5, Double.PositiveInfinity, 1.0) - val (excludedBothRelF, excludedBothRelMK) = filter2.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter2 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 0.5, Double.PositiveInfinity, 1.0, 1.0) + val (excludedBothRelF, excludedBothRelMK) = + filter2.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothRelF.toSet shouldEqual Set("A") excludedBothRelMK.isEmpty shouldBe true } it should "correctly determine which features to exclude based on the stats of fill rate ratio" in { // relative fill ratio matters - val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, 2.0, 1.0) - val (excludedBothRelFR, excludedBothRelMKR) = filter4.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, 2.0, 1.0, 1.0) + val (excludedBothRelFR, excludedBothRelMKR) = + filter4.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothRelFR.toSet shouldEqual Set("D", "A", "B") excludedBothRelMKR.isEmpty shouldBe true } it should "correctly determine which features to exclude based on the stats of js distance" in { // js distance - val filter3 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, 
Double.PositiveInfinity, 0.5) - val (excludedBothDistF, excludedBothDistMK) = filter3.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter3 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.0, 1.0, Double.PositiveInfinity, 0.5, 1.0) + val (excludedBothDistF, excludedBothDistMK) = + filter3.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothDistF.isEmpty shouldEqual true excludedBothDistMK.keySet shouldEqual Set("C") excludedBothDistMK.head._2 shouldEqual Set("1") @@ -233,8 +136,9 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { it should "correctly determine which features to exclude based on all the stats" in { // all - val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.5, Double.PositiveInfinity, 0.5) - val (excludedBothAllF, excludedBothAllMK) = filter4.getFeaturesToExclude(trainSummaries, scoreSummaries) + val filter4 = new RawFeatureFilter(simpleReader, Some(dataReader), 10, 0.1, 0.5, Double.PositiveInfinity, 0.5, 1.0) + val (excludedBothAllF, excludedBothAllMK) = + filter4.getFeaturesToExclude(trainSummaries, scoreSummaries, Map.empty) excludedBothAllF.toSet shouldEqual Set("A", "B", "C", "D") excludedBothAllMK.isEmpty shouldBe true } @@ -244,12 +148,12 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { val survPred = survived.copy(isResponse = false) val features: Array[OPFeature] = Array(survPred, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.0, 1.0, Double.PositiveInfinity, 1.0) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.0, 1.0, Double.PositiveInfinity, 1.0, 1.0) val (df, toDrop) = filter.generateFilteredRaw(features, params) toDrop.isEmpty shouldBe true df.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields - val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0) + val filter1 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) val (df1, toDrop1) = filter1.generateFilteredRaw(features, params) toDrop1 should contain theSameElementsAs Array(survPred) df1.schema.fields.exists(_.name == survPred.name) shouldBe false @@ -260,7 +164,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { val params = new OpParams() val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.5, 0.5, Double.PositiveInfinity, 1.0, 1.0) val (df, toDrop) = filter.generateFilteredRaw(features, params) toDrop.isEmpty shouldBe true df.schema.fields should contain theSameElementsAs passengersDataSet.schema.fields @@ -272,14 +176,95 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest { val params = new OpParams() val features: Array[OPFeature] = Array(survived, age, gender, height, weight, description, boarded) - val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2) + val filter = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9) val (df, toDrop) = filter.generateFilteredRaw(features, params) toDrop.toSet shouldEqual Set(age, gender, 
height, weight, description, boarded) - df.schema.fields.map(_.name) should contain theSameElementsAs Array("key", "survived") + df.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name) - val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, Set("age", "gender")) + val filter2 = new RawFeatureFilter(dataReader, Some(simpleReader), 10, 0.1, 0.1, 2, 0.2, 0.9, + protectedFeatures = Set(age.name, gender.name)) val (df2, toDrop2) = filter2.generateFilteredRaw(features, params) toDrop2.toSet shouldEqual Set(height, weight, description, boarded) - df2.schema.fields.map(_.name) should contain theSameElementsAs Array("key", "survived", "age", "gender") + df2.schema.fields.map(_.name) should contain theSameElementsAs + Array(DataFrameFieldNames.KeyFieldName, survived.name, age.name, gender.name) + } + + it should "not drop JS divergence-protected features based on JS divergence check" in { + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, boardedTime, boardedTimeAsDateTime) + val filter = new RawFeatureFilter( + trainingReader = dataReader, + scoreReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 0.0, + maxCorrelation = 1.0, + jsDivergenceProtectedFeatures = Set(boardedTime.name, boardedTimeAsDateTime.name)) + + val (df, toDrop) = filter.generateFilteredRaw(features, params) + toDrop.toSet shouldEqual Set(age, gender, height, weight, description, boarded) + df.schema.fields.map(_.name) should contain theSameElementsAs + Seq(DataFrameFieldNames.KeyFieldName, survived.name, boardedTime.name, boardedTimeAsDateTime.name) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.9" in { + val expectedDropped = Seq(boarded, weight, gender) + val expectedMapKeys = Seq("Female", "Male") + nullLabelCorrelationTest(0.9, expectedDropped, expectedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.6" in { + val expectedDropped = Seq(boarded, weight, gender, age) + val expectedMapKeys = Seq("Female", "Male") + nullLabelCorrelationTest(0.6, expectedDropped, expectedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.4" in { + val expectedDropped = Seq(boarded, weight, gender, age, description) + val expectedMapKeys = Seq("Male") + nullLabelCorrelationTest(0.4, expectedDropped, expectedMapKeys) + } + + it should "correctly drop features based on null-label leakage correlation greater than 0.3" in { + val expectedDropped = Seq(boarded, weight, gender, age, description, booleanMap, numericMap, stringMap) + nullLabelCorrelationTest(0.3, expectedDropped, Seq()) + } + + private def nullLabelCorrelationTest( + maxCorrelation: Double, + expectedDropped: Seq[OPFeature], + expectedMapKeys: Seq[String] + ): Unit = { + def getFilter(maxCorrelation: Double): RawFeatureFilter[Passenger] = new RawFeatureFilter( + trainingReader = dataReader, + scoreReader = Some(simpleReader), + bins = 10, + minFill = 0.0, + maxFillDifference = 1.0, + maxFillRatioDiff = Double.PositiveInfinity, + maxJSDivergence = 1.0, + maxCorrelation = maxCorrelation) + + val params = new OpParams() + val features: Array[OPFeature] = + Array(survived, age, gender, height, weight, description, boarded, stringMap, numericMap, 
booleanMap) + val (df, dropped) = getFilter(maxCorrelation).generateFilteredRaw(features, params) + + dropped should contain theSameElementsAs expectedDropped.toSeq + df.schema.fields.map(_.name) should contain theSameElementsAs + DataFrameFieldNames.KeyFieldName +: features.diff(dropped).map(_.name) + if (expectedMapKeys.nonEmpty) { + df.collect(booleanMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(numericMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + df.collect(stringMap).map(_.value.keySet).reduce(_ + _) should contain theSameElementsAs expectedMapKeys + } else { + intercept[IllegalArgumentException] { df.collect(booleanMap) } + intercept[IllegalArgumentException] { df.collect(numericMap) } + intercept[IllegalArgumentException] { df.collect(stringMap) } + } } } diff --git a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala b/core/src/test/scala/com/salesforce/op/filters/SummaryTest.scala similarity index 76% rename from core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala rename to core/src/test/scala/com/salesforce/op/filters/SummaryTest.scala index ba364d439d..4bbb47b5ee 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/SummaryTest.scala @@ -29,21 +29,23 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package com.salesforce.op.stages.base.ternary +package com.salesforce.op.filters -import com.salesforce.op.features.types.Text -import com.salesforce.op.test._ -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.test.TestCommon import org.junit.runner.RunWith +import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class TernaryTransformerTest extends FlatSpec with TestCommon { - - Spec[TernaryLambdaTransformer[_, _, _, _]] should "copy successfully" in { - val tr = new TernaryLambdaTransformer[Text, Text, Text, Text](operationName = "foo", transformFn = (x, y, z) => x) - tr.copy(new ParamMap()).uid shouldBe tr.uid +class SummaryTest extends FlatSpec with TestCommon { + Spec[Summary] should "be correctly created from a sequence of features" in { + val f1 = Left(Seq("a", "b", "c")) + val f2 = Right(Seq(0.5, 1.0)) + val f1s = Summary(f1) + val f2s = Summary(f2) + f1s.min shouldBe 3 + f1s.max shouldBe 3 + f2s.min shouldBe 0.5 + f2s.max shouldBe 1.0 } - } diff --git a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala deleted file mode 100644 index 51faecf92c..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.base.ternary - -import breeze.numerics.abs -import com.salesforce.op.UID -import com.salesforce.op.features.Feature -import com.salesforce.op.features.types._ -import com.salesforce.op.test.PassengerSparkFixtureTest -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types._ -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner - - -@RunWith(classOf[JUnitRunner]) -class TernaryEstimatorTest extends FlatSpec with PassengerSparkFixtureTest { - - val testEstimator: TernaryEstimator[MultiPickList, Binary, RealMap, Real] = new TripleInteractionsEstimator() - - Spec[TernaryEstimator[_, _, _, _]] should "error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(gender, survived, numericMap).getOutput() - outputFeatures shouldBe new Feature[Real]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = true, - parents = Array(gender, survived, numericMap) - ) - } - - it should "return a TernaryModel with the estimator as the parent and the correct function" in { - val testModel = testEstimator.setInput(gender, survived, numericMap).fit(passengersDataSet) - - testModel.parent shouldBe testEstimator - abs( - testModel.transformFn(Seq("male").toMultiPickList, false.toBinary, Map("male" -> 1.2).toRealMap).value.get - 0.0 - ) should be < 0.000000002 - - testModel.transformFn(Seq("male").toMultiPickList, true.toBinary, Map("male" -> 1.2).toRealMap).value shouldBe None - abs( - testModel.transformFn(Seq("male").toMultiPickList, false.toBinary, Map("male" -> 2.2).toRealMap).value.get - 1.0 - ) should be < 0.000000002 - } - - it should "create a TernaryModel that uses the specified transform function when fit" in { - val testModel = testEstimator.setInput(gender, survived, numericMap).fit(passengersDataSet) - val testDataTransformed = testModel.setInput(gender, survived, numericMap) - .transform(passengersDataSet.select(gender.name, survived.name, numericMap.name)) - - testDataTransformed.schema shouldEqual StructType( - Seq(StructField(gender.name, ArrayType(StringType, true), true), - StructField(survived.name, BooleanType, true), - StructField(numericMap.name, MapType(StringType, DoubleType, 
true), true), - StructField(testEstimator.getOutputFeatureName, DoubleType, true))) - - testDataTransformed.collect(gender, survived, numericMap, testModel.getOutput()) shouldEqual Array( - (Set("Male").toMultiPickList, false.toBinary, new RealMap(Map("Male" -> 2.0)), 0.8.toReal), - (Seq().toMultiPickList, true.toBinary, new RealMap(Map()), new Real(None)), - (Set("Female").toMultiPickList, new Binary(None), Map("Female" -> 1.0).toRealMap, new Real(-0.19999999999999996)), - (Set("Female").toMultiPickList, new Binary(None), Map("Female" -> 1.0).toRealMap, new Real(-0.19999999999999996)), - (Set("Male").toMultiPickList, new Binary(None), Map("Male" -> 1.0).toRealMap, new Real(-0.19999999999999996)), - (Set("Female").toMultiPickList, new Binary(None), Map("Female" -> 1.0).toRealMap, new Real(-0.19999999999999996)) - ) - } - - it should "copy itself and the model successfully" in { - val est = new TripleInteractionsEstimator() - val mod = new TripleInteractionsModel(0.0, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } - -} - - -class TripleInteractionsEstimator(uid: String = UID[TripleInteractionsEstimator]) - extends TernaryEstimator[MultiPickList, Binary, RealMap, Real](operationName = "tripleInteractions", uid = uid) - with TripleInteractions { - - // scalastyle:off line.size.limit - def fitFn(dataset: Dataset[(MultiPickList#Value, Binary#Value, RealMap#Value)]): TernaryModel[MultiPickList, Binary, RealMap, Real] = { - import dataset.sparkSession.implicits._ - val mean = { - dataset.map { case (gndr, srvvd, nmrcMp) => - if (survivedAndMatches(gndr, srvvd, nmrcMp)) nmrcMp(gndr.head) else 0.0 - }.filter(_ != 0.0).groupBy().mean().first().getDouble(0) - } - new TripleInteractionsModel(mean = mean, operationName = operationName, uid = uid) - } - // scalastyle:on - -} - -final class TripleInteractionsModel private[op](val mean: Double, operationName: String, uid: String) - extends TernaryModel[MultiPickList, Binary, RealMap, Real](operationName = operationName, uid = uid) - with TripleInteractions { - - def transformFn: (MultiPickList, Binary, RealMap) => Real = (g: MultiPickList, s: Binary, nm: RealMap) => new Real( - if (!survivedAndMatches(g.value, s.value, nm.value)) None - else Some(nm.value(g.value.head) - mean) - ) - -} - -sealed trait TripleInteractions { - def survivedAndMatches(g: MultiPickList#Value, s: Binary#Value, nm: RealMap#Value): Boolean = - !s.getOrElse(false) && g.nonEmpty && nm.contains(g.head) -} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala deleted file mode 100644 index fd14ddac66..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -package com.salesforce.op.stages.base.unary - -import com.salesforce.op.UID -import com.salesforce.op.features.Feature -import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType} -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner - - -@RunWith(classOf[JUnitRunner]) -class UnaryEstimatorTest extends FlatSpec with TestSparkContext { - - val (ds, f1) = TestFeatureBuilder(Seq(1.0, 5.0, 3.0, 2.0, 6.0).toReal) - - val testEstimator: UnaryEstimator[Real, Real] = new MinMaxNormEstimator() - - Spec[UnaryEstimator[_, _]] should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a copy with the same uid" in { - val newData = new MetadataBuilder().putLong("myKey", 100).build() - val copyWithValues = testEstimator.copy( - new ParamMap().put(testEstimator.outputMetadata, newData) - ) - - copyWithValues.isInstanceOf[UnaryEstimator[_, _]] - copyWithValues.uid shouldBe testEstimator.uid - copyWithValues.getMetadata() shouldBe newData - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(f1).getOutput() - - outputFeatures shouldBe new Feature[Real]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(f1) - ) - } - - it should "return a UnaryModel with the estimator as the parent, a working copy method and the same uid" + - " and the correct function" in { - val testModel = testEstimator.setInput(f1).fit(ds) - - testModel.parent shouldBe testEstimator - testModel.transformFn(1.0.toReal) shouldBe 0.0.toReal - testModel.copy(new ParamMap()).uid shouldBe testEstimator.uid - } - - it should "create a UnaryModel transformer when it is fit" in { - val testModel = testEstimator.setInput(f1).fit(ds) - val testDataTransformed = testModel.setInput(f1).transform(ds) - val outputFeatures = testModel.getOutput() - val transformedValues = testDataTransformed.collect(f1, outputFeatures) - - val expectedTypes = - StructType(Seq(StructField(f1.name, DoubleType, true), - StructField(outputFeatures.name, DoubleType, true))) - - testDataTransformed.schema shouldEqual 
expectedTypes - transformedValues shouldEqual - Array((1.0, 0.0), (5.0, 0.8), (3.0, 0.4), (2.0, 0.2), (6.0, 1.0)).map(v => v._1.toReal -> v._2.toReal) - } - - it should "copy itself and the model successfully" in { - val est = new MinMaxNormEstimator() - val mod = new MinMaxNormEstimatorModel(0.0, 0.0, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } - -} - -class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator]) - extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) { - - def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = { - val grouped = dataset.groupBy() - val maxVal = grouped.max().first().getDouble(0) - val minVal = grouped.min().first().getDouble(0) - new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid) - } -} - -final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String) - extends UnaryModel[Real, Real](operationName = operationName, uid = uid) { - def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal -} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala deleted file mode 100644 index 8548be79d1..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
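The deleted `UnaryEstimatorTest` above covered the same fitFn-then-model pattern as the ternary spec: the estimator's `fitFn` computes summary statistics over the dataset and returns a model whose `transformFn` applies them row by row. A minimal plain-Scala sketch of the min-max scaling that test verified (data and expected values taken from the test itself; the object and method names here are illustrative):

```scala
// Sketch of the scaling logic behind MinMaxNormEstimator:
// fit computes (min, max) over the column, transform maps v to (v - min) / (max - min).
object MinMaxSketch {
  def fit(values: Seq[Double]): (Double, Double) = (values.min, values.max)

  def transform(min: Double, max: Double)(v: Double): Double = (v - min) / (max - min)

  def main(args: Array[String]): Unit = {
    val data = Seq(1.0, 5.0, 3.0, 2.0, 6.0) // the sample from the deleted test
    val (lo, hi) = fit(data)
    println(data.map(transform(lo, hi)))    // List(0.0, 0.8, 0.4, 0.2, 1.0), as asserted there
  }
}
```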
- */ - -package com.salesforce.op.stages.base.unary - -import com.salesforce.op.features.Feature -import com.salesforce.op.features.types._ -import com.salesforce.op.test.PassengerSparkFixtureTest -import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.spark.RichRow._ -import org.apache.spark.ml.param.ParamMap -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} - -@RunWith(classOf[JUnitRunner]) -class UnaryTransformerTest extends FlatSpec with PassengerSparkFixtureTest { - - val scaleBy2 = new UnaryLambdaTransformer[Real, Real](operationName = "unary", - transformFn = r => r.v.map(_ * 2.0).toReal - ) - - val toCat = new UnaryLambdaTransformer[Real, MultiPickList](operationName = "cat", - transformFn = value => Set(value.v.getOrElse(0.0).toString).toMultiPickList - ) - - Spec[UnaryLambdaTransformer[_, _]] should "return single properly formed Feature" in { - scaleBy2.setInput(weight) - val feats = scaleBy2.getOutput() - - feats shouldBe new Feature[Real]( - name = scaleBy2.getOutputFeatureName, - originStage = scaleBy2, - isResponse = false, - parents = Array(weight) - ) - } - - it should "add column to DataFrame when transformed" in { - scaleBy2.setInput(weight) - val transformedData = scaleBy2.transform(passengersDataSet) - val output = scaleBy2.getOutput() - val answer = passengersArray.map(r => scaleBy2.transformFn(r.getFeatureType[Real](weight))) - transformedData.columns.contains(scaleBy2.getOutputFeatureName) shouldBe true - transformedData.collect(output) shouldBe answer - } - - it should "work when returning a MultiPickList feature" in { - toCat.setInput(weight) - val transformedData = toCat.transform(passengersDataSet) - val output = toCat.getOutput() - val answer = passengersArray.map(r => toCat.transformFn(r.getFeatureType[Real](weight))) - transformedData.columns.contains(toCat.getOutputFeatureName) shouldBe true - transformedData.collect(output) shouldBe answer - } - - it should "copy successfully" in { - val copy = scaleBy2.copy(new ParamMap()) - copy shouldBe a[UnaryTransformer[_, _]] - copy.uid shouldBe scaleBy2.uid - } - -} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/PredictionEquality.scala b/core/src/test/scala/com/salesforce/op/stages/impl/PredictionEquality.scala new file mode 100644 index 0000000000..5bfe525a6e --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/PredictionEquality.scala @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl + +import com.salesforce.op.features.types.Prediction +import com.salesforce.op.test.OpEstimatorSpec +import org.scalactic.Equality + +trait PredictionEquality { + + self: OpEstimatorSpec[Prediction, _, _] => + + abstract override implicit val featureTypeEquality = new Equality[Prediction] { + def areEqual(a: Prediction, b: Any): Boolean = b match { + case s: Prediction => + val keyset = a.v.keySet.union(s.v.keySet) + keyset.forall(k => math.abs(a.v(k) - s.v(k)) < 0.01) + case _ => false + } + } + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala index 655dc29471..c375a58d5b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/BinaryClassificationModelSelectorTest.scala @@ -460,7 +460,7 @@ class BinaryClassificationModelSelectorTest extends FlatSpec with TestSparkConte testEstimator.evaluators.foreach { case evaluator: OpBinaryClassificationEvaluator => { - MultiClassEvalMetrics.values.foreach(metric => + BinaryClassEvalMetrics.values.foreach(metric => Seq(trainMetaData, holdOutMetaData).foreach( metadata => assert(metadata.contains(s"(${OpEvaluatorNames.binary})_${metric.entryName}"), s"Metric ${metric.entryName} is not present in metadata: " + metadata.json) diff --git a/core/src/test/scala/org/apache/spark/ml/classification/OpClassifierModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala similarity index 65% rename from core/src/test/scala/org/apache/spark/ml/classification/OpClassifierModelTest.scala rename to core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala index 0c72bf52af..0cdb9c5873 100644 --- a/core/src/test/scala/org/apache/spark/ml/classification/OpClassifierModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpClassifierModelTest.scala @@ -29,18 +29,18 @@ * POSSIBILITY OF SUCH DAMAGE. 
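The new `PredictionEquality` trait plugs a tolerant scalactic `Equality[Prediction]` into the spec, so expected and actual predictions count as equal when every key of the underlying map agrees within 0.01. A standalone sketch of the same idea for a plain `Map[String, Double]` (a slightly more defensive variant, assumed here, that treats a key missing on either side as 0.0 instead of throwing):

```scala
import org.scalactic.Equality

// Tolerant equality over double-valued maps, mirroring PredictionEquality above.
val tolerantMapEq: Equality[Map[String, Double]] = new Equality[Map[String, Double]] {
  def areEqual(a: Map[String, Double], b: Any): Boolean = b match {
    case m: Map[_, _] =>
      val other = m.asInstanceOf[Map[String, Double]]
      val keys = a.keySet union other.keySet
      keys.forall(k => math.abs(a.getOrElse(k, 0.0) - other.getOrElse(k, 0.0)) < 0.01)
    case _ => false
  }
}
```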
*/ -package org.apache.spark.ml.classification +package com.salesforce.op.stages.impl.classification -import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} +import com.salesforce.op.features.types.{Prediction, RealNN} +import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter._ import com.salesforce.op.test._ import com.salesforce.op.testkit._ -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.apache.spark.ml.SparkModelConverter._ -import org.apache.spark.ml.linalg.Vector - @RunWith(classOf[JUnitRunner]) @@ -62,7 +62,7 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -70,22 +70,24 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext { Spec[OpLogisticRegressionModel] should "produce the same values as the spark version" in { val spk = new LogisticRegression() + .setFamily("multinomial") .setFeaturesCol(featureV.name) .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } Spec[OpNaiveBayesModel] should "produce the same values as the spark version" in { val spk = new NaiveBayes() + .setModelType("multinomial") .setFeaturesCol(featureV.name) .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk), isMultinomial = true).setInput(labelF, featureV) + val op = toOP(spk, uid = spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -96,11 +98,42 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + + compareOutputs(spk.transform(rawDF), op.transform(rawDF)) + } + + Spec[OpGBTClassificationModel] should "produce the same values as the spark version" in { + val spk = new GBTClassifier() + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } + Spec[OpLinearSVCModel] should "produce the same values as the spark version" in { + val spk = new LinearSVC() + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + + compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 3) + } + + Spec[OpMultilayerPerceptronClassificationModel] should "produce the same values as the spark version" in { + val spk = new MultilayerPerceptronClassifier() + .setLayers(Array(10, 5, 4, 2)) // this is hard to generalize input layer must = number of features + // output layer must equal number of labels + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + compareOutputsPred(spk.transform(rawDF), op.transform(rawDF), 2) + } + def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { def keysStartsWith(name: String, value: Map[String, Double]): 
Array[Double] = { @@ -116,4 +149,13 @@ class OpClassifierModelTest extends FlatSpec with TestSparkContext { r1.getAs[Vector](2).toArray shouldEqual keysStartsWith(Prediction.Keys.RawPredictionName, map) } } + + def compareOutputsPred(df1: DataFrame, df2: DataFrame, predIndex: Int): Unit = { + val sorted1 = df1.collect().sortBy(_.getAs[Double](predIndex)) + val sorted2 = df2.collect().sortBy(_.getAs[Map[String, Double]](2)(Prediction.Keys.PredictionName)) + sorted1.zip(sorted2).foreach{ case (r1, r2) => + val map = r2.getAs[Map[String, Double]](2) + r1.getAs[Double](predIndex) shouldEqual map(Prediction.Keys.PredictionName) + } + } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala new file mode 100644 index 0000000000..7856f57585 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpDecisionTreeClassifierTest.scala @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
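Both comparison helpers above lean on the fact that the OP `Prediction` type folds what Spark spreads over three columns (prediction, rawPrediction, probability) into one `Map[String, Double]`. A short sketch using only the constructor and keys visible in these tests (the exact per-class key format, e.g. "rawPrediction_0", is an assumption; the tests rely only on the prefix):

```scala
import com.salesforce.op.features.types.Prediction

// Built the same way the expected results in these specs are built:
val p = Prediction(1.0, Array(-1.5, 1.5), Array(0.1, 0.9))

// One flat map; per-class values sit under keys sharing a common prefix,
// which is why compareOutputs gathers them via Prediction.Keys.RawPredictionName.
val map: Map[String, Double] = p.v
val pred: Double = map(Prediction.Keys.PredictionName) // 1.0
```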
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpDecisionTreeClassifierTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[DecisionTreeClassificationModel], + OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpDecisionTreeClassifier().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)), + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)), + Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)), + Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala new file mode 100644 index 0000000000..49bba49cf0 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpGBTClassifierTest.scala @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpGBTClassifierTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTClassificationModel], + OpPredictorWrapper[GBTClassifier, GBTClassificationModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpGBTClassifier().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)), + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)), + Prediction(1.0, Array(-1.54, 1.54), Array(0.04, 0.95)), + Prediction(0.0, Array(1.54, -1.54), Array(0.95, 0.04)) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(10) + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala 
b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala new file mode 100644 index 0000000000..83e13e3784 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLinearSVCTest.scala @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
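The decision-tree and GBT specs above show the refactor this diff applies everywhere: the old three-stage wrappers are gone, and `OpPredictorWrapper` exposes the underlying Spark predictor through typed setters while emitting a single `Prediction` feature. A hedged usage sketch, assuming it runs inside a spec that provides a Spark session (as `TestFeatureBuilder` requires):

```scala
import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.classification.OpDecisionTreeClassifier
import com.salesforce.op.test.TestFeatureBuilder
import org.apache.spark.ml.linalg.Vectors

val (df, rawLabel, features) = TestFeatureBuilder("label", "features",
  Seq[(RealNN, OPVector)](
    1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector,
    0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector))
val label = rawLabel.copy(isResponse = true)

// One estimator, typed Spark-param setters, one Prediction output, no extra stages.
val dt = new OpDecisionTreeClassifier().setMaxDepth(6).setInput(label, features)
val model = dt.fit(df)           // an OpPredictorWrapperModel[DecisionTreeClassificationModel]
val scored = model.transform(df) // adds a single Prediction column
```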
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{LinearSVC, LinearSVCModel} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpLinearSVCTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearSVCModel], + OpPredictorWrapper[LinearSVC, LinearSVCModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpLinearSVC().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Vectors.dense(Array(-1.33, 1.33))), + Prediction(0.0, Vectors.dense(Array(1.04, -1.04))), + Prediction(0.0, Vectors.dense(Array(2.69, -2.69))), + Prediction(1.0, Vectors.dense(Array(-1.32, 1.32))), + Prediction(1.0, Vectors.dense(Array(-2.11, 2.11))), + Prediction(0.0, Vectors.dense(Array(4.41, -4.41))), + Prediction(1.0, Vectors.dense(Array(-1.46, 1.46))), + Prediction(0.0, Vectors.dense(Array(1.42, -1.42))) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setRegParam(0.1) + .setMaxIter(20) + .setTol(1E-4) + estimator.fit(inputData) + + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getMaxIter shouldBe 20 + estimator.predictor.getTol shouldBe 1E-4 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala index 1eac82c03b..d647a6ccb5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpLogisticRegressionTest.scala @@ -32,19 +32,20 @@ package com.salesforce.op.stages.impl.classification import com.salesforce.op.features.types._ -import com.salesforce.op.stages.sparkwrappers.generic._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import org.apache.spark.ml.classification.LogisticRegressionModel +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class OpLogisticRegressionTest extends FlatSpec with 
TestSparkContext { +class OpLogisticRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LogisticRegressionModel], + OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]] with PredictionEquality { - val (testData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", Seq[(RealNN, OPVector)]( 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, @@ -57,102 +58,31 @@ class OpLogisticRegressionTest extends FlatSpec with TestSparkContext { ) ) val feature1 = rawFeature1.copy(isResponse = true) - val logReg = new OpLogisticRegression().setInput(feature1, feature2) - - Spec[OpLogisticRegression] should "have properly formed stage1" in { - assert(logReg.stage1.isInstanceOf[SwBinaryEstimator[_, _, _, _, _]]) - val inputNames = logReg.stage1.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(feature1.name, feature2.name) - logReg.stage1.getOutput().name shouldBe logReg.stage1.getOutputFeatureName - the[IllegalArgumentException] thrownBy { - logReg.setInput(feature1.copy(isResponse = true), feature2.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." - } - - it should "have properly formed stage2" in { - assert(logReg.stage2.isInstanceOf[SwTernaryTransformer[_, _, _, _, _]]) - val inputNames = logReg.stage2.getInputFeatures().map(_.name) - inputNames should have length 3 - inputNames shouldBe Array(feature1.name, feature2.name, logReg.stage1.getOutputFeatureName) - logReg.stage2.getOutput().name shouldBe logReg.stage2.getOutputFeatureName - - } - - it should "have properly formed stage3" in { - assert(logReg.stage3.isInstanceOf[SwQuaternaryTransformer[_, _, _, _, _, _]]) - val inputNames = logReg.stage3.getInputFeatures().map(_.name) - inputNames should have length 4 - inputNames shouldBe Array(feature1.name, feature2.name, logReg.stage1.getOutputFeatureName, - logReg.stage2.getOutputFeatureName) - - logReg.stage3.getOutput().name shouldBe logReg.stage3.getOutputFeatureName - } - - it should "have proper outputs corresponding to the stages" in { - val outputs = logReg.getOutput() - outputs._1.name shouldBe logReg.stage1.getOutput().name - outputs._2.name shouldBe logReg.stage2.getOutput().name - outputs._3.name shouldBe logReg.stage3.getOutput().name - - // as long as the parent stages are correct, we can also assume - // that the parent features are correct, since that should - // be verified in the unit tests for the transformers. 
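The stage1/stage2/stage3 assertions deleted here are exactly what the new `OpEstimatorSpec` base class appears to standardize: a spec now only supplies `inputData`, an `estimator`, and an `expectedResult`, and inherits the fitting, output, and copy checks. The skeleton, abstracted from the specs in this diff (`mySample` is a hypothetical `Seq[(RealNN, OPVector)]` fixture):

```scala
import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.impl.classification.OpLogisticRegression
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}

class MyLogRegTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[LogisticRegressionModel],
  OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]] with PredictionEquality {

  val (inputData, rawLabel, features) = TestFeatureBuilder("label", "features", mySample)
  val label = rawLabel.copy(isResponse = true)
  val estimator = new OpLogisticRegression().setInput(label, features)
  val expectedResult = Seq.empty[Prediction] // one Prediction per input row
}
```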
- outputs._1.originStage shouldBe logReg.stage1 - outputs._2.originStage shouldBe logReg.stage2 - outputs._3.originStage shouldBe logReg.stage3 - } - - - it should "return a properly formed LogisticRegressionModel when fitted" in { - val model = logReg.setSparkParams("maxIter", 10).fit(testData) - - model shouldBe a[SwThreeStageBinaryModel[_, _, _, _, _, _]] - model.stage1 shouldBe a[SwBinaryModel[_, _, _, _]] - - val sparkStage = model.stage1.getSparkMlStage() - sparkStage.get.isInstanceOf[LogisticRegressionModel] - assert(model.stage2.getSparkMlStage().isEmpty) - assert(model.stage3.getSparkMlStage().isEmpty) - - model.stage1OperationName shouldBe "LogisticRegression_predictionCol" - model.stage2OperationName shouldBe "LogisticRegression_rawPredictionCol" - model.stage3OperationName shouldBe "LogisticRegression_probabilityCol" - - val inputNames = model.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(feature1.name, feature2.name) - } + val estimator = new OpLogisticRegression().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(-20.88, 20.88), Array(0.0, 1.0)), + Prediction(0.0, Array(16.70, -16.7), Array(1.0, 0.0)), + Prediction(0.0, Array(22.2, -22.2), Array(1.0, 0.0)), + Prediction(1.0, Array(-18.35, 18.35), Array(0.0, 1.0)), + Prediction(1.0, Array(-31.46, 31.46), Array(0.0, 1.0)), + Prediction(0.0, Array(24.67, -24.67), Array(1.0, 0.0)), + Prediction(1.0, Array(-22.07, 22.07), Array(0.0, 1.0)), + Prediction(0.0, Array(20.9, -20.9), Array(1.0, 0.0)) + ) it should "allow the user to set the desired spark parameters" in { - logReg.setSparkParams("maxIter", 10).setSparkParams("regParam", 0.1) - logReg.getSparkParams("maxIter") shouldBe Some(10) - logReg.getSparkParams("regParam") shouldBe Some(0.1) - - logReg.setThresholds(Array(0.03, 0.06)).setElasticNetParam(0.1) - logReg.getSparkParams("thresholds").get.asInstanceOf[Array[Double]] should contain theSameElementsAs - Array(0.03, 0.06) - logReg.getSparkParams("elasticNetParam") shouldBe Some(0.1) + estimator + .setRegParam(0.1) + .setElasticNetParam(0.1) + .setMaxIter(20) + estimator.fit(inputData) + + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getElasticNetParam shouldBe 0.1 + estimator.predictor.getMaxIter shouldBe 20 } - - // TODO: move this to OpWorkFlowTest - // it should "work in a workflow" in { - // val (prob, rawpred, pred) = logReg.getOutput() - // val workflow = new OpWorkflow().setResultFeatures(pred) - // - // val reader = DataReaders.Simple.custom[LRDataTest]( - // readFn = (s: Option[String], spk: SparkSession) => spk.sparkContext.parallelize(DataTest.input) - // ) - // - // val workflowModel = workflow.setReader(reader).train() - // val scores = workflowModel.score() - // val justScores = scores.select(s"(label)_(features)_((label)_(features)_${stageNames(0)})_" + - // s"((label)_(features)_((label)_(features)_${stageNames(0)})_${stageNames(1)})_${stageNames(2)}") - // .collect().map(_.getAs[Double](0)).toList - // justScores shouldEqual List(1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0) - // } - } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala new file mode 100644 index 0000000000..19efea8605 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpMultilayerPerceptronClassifierTest.scala @@ -0,0 +1,92 @@ +/* + * 
Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpMultilayerPerceptronClassifierTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[MultilayerPerceptronClassificationModel], + OpPredictorWrapper[MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpMultilayerPerceptronClassifier() + .setInput(feature1, feature2) + .setLayers(Array(3, 5, 4, 2)) + + + val expectedResult = Seq( + Prediction(1.0), + Prediction(0.0), + Prediction(0.0), + Prediction(1.0), + Prediction(1.0), + Prediction(0.0), + Prediction(1.0), + Prediction(0.0) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(50) + .setBlockSize(2) + .setSeed(42) 
+ estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 50 + estimator.predictor.getBlockSize shouldBe 2 + estimator.predictor.getSeed shouldBe 42 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala new file mode 100644 index 0000000000..a215cef91c --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpNaiveBayesTest.scala @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
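The multilayer perceptron specs hard-code `setLayers` because, as the inline comment in `OpClassifierModelTest` notes, the first layer must equal the feature-vector size and the last layer the number of classes; only the hidden sizes are free. A tiny illustrative helper (not part of the API) that makes the constraint explicit:

```scala
// First element = number of features, last = number of classes.
def mlpLayers(numFeatures: Int, hidden: Seq[Int], numClasses: Int): Array[Int] =
  (numFeatures +: hidden :+ numClasses).toArray

// The 3-feature, 2-class sample above yields the layers the spec sets:
mlpLayers(numFeatures = 3, hidden = Seq(5, 4), numClasses = 2) // Array(3, 5, 4, 2)
```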
+ */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class OpNaiveBayesTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[NaiveBayesModel], + OpPredictorWrapper[NaiveBayes, NaiveBayesModel]] with PredictionEquality { + + val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features", + Seq[(RealNN, OPVector)]( + 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, + 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, + 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, + 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, + 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, + 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector + ) + ) + val feature1 = rawFeature1.copy(isResponse = true) + val estimator = new OpNaiveBayes().setInput(feature1, feature2) + + val expectedResult = Seq( + Prediction(1.0, Array(-34.41, -14.85), Array(0.0, 1.0)), + Prediction(0.0, Array(-1.07, -1.42), Array(0.58, 0.41)), + Prediction(0.0, Array(-9.70, -17.99), Array(1.0, 0.0)), + Prediction(1.0, Array(-26.22, -8.33), Array(0.0, 1.0)), + Prediction(1.0, Array(-41.93, -16.49), Array(0.0, 1.0)), + Prediction(0.0, Array(-8.60, -27.31), Array(1.0, 0.0)), + Prediction(1.0, Array(-31.07, -11.44), Array(0.0, 1.0)), + Prediction(0.0, Array(-4.54, -6.32), Array(0.85, 0.14)) + ) + + + it should "allow the user to set the desired spark parameters" in { + estimator + .setSmoothing(2) + estimator.fit(inputData) + + estimator.predictor.getSmoothing shouldBe 2 + } +} + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala new file mode 100644 index 0000000000..ed39afa3c5 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestClassifierTest.scala @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
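In `OpNaiveBayesTest` above the raw scores are negative because Spark's naive Bayes reports per-class log(prior) + log(likelihood) sums; its probability column is, to my understanding, the numerically stabilized softmax of those scores. A sketch that roughly reproduces the second expected row (exact digits differ because the spec's raws are rounded):

```scala
// Raw class scores -> probabilities via max-shifted exponentiation and normalization.
def rawToProb(raw: Array[Double]): Array[Double] = {
  val m = raw.max                         // shift by max for numerical stability
  val exps = raw.map(r => math.exp(r - m))
  exps.map(_ / exps.sum)
}

rawToProb(Array(-1.07, -1.42)) // ~Array(0.59, 0.41), matching the spec's (0.58, 0.41)
```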
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.classification + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpRandomForestClassifierTest extends + OpEstimatorSpec[Prediction, OpPredictorWrapperModel[RandomForestClassificationModel], + OpPredictorWrapper[RandomForestClassifier, RandomForestClassificationModel]] with PredictionEquality { + + lazy val (inputData, rawLabelMulti, featuresMulti) = + TestFeatureBuilder[RealNN, OPVector]("labelMulti", "featuresMulti", + Seq( + (1.0.toRealNN, Vectors.dense(12.0, 4.3, 1.3).toOPVector), + (0.0.toRealNN, Vectors.dense(0.0, 0.3, 0.1).toOPVector), + (2.0.toRealNN, Vectors.dense(1.0, 3.9, 4.3).toOPVector), + (2.0.toRealNN, Vectors.dense(10.0, 1.3, 0.9).toOPVector), + (1.0.toRealNN, Vectors.dense(15.0, 4.7, 1.3).toOPVector), + (0.0.toRealNN, Vectors.dense(0.5, 0.9, 10.1).toOPVector), + (1.0.toRealNN, Vectors.dense(11.5, 2.3, 1.3).toOPVector), + (0.0.toRealNN, Vectors.dense(0.1, 3.3, 0.1).toOPVector), + (2.0.toRealNN, Vectors.dense(1.0, 4.0, 4.5).toOPVector), + (2.0.toRealNN, Vectors.dense(10.0, 1.5, 1.0).toOPVector) + ) + ) + + val labelMulti = rawLabelMulti.copy(isResponse = true) + + val estimator = new OpRandomForestClassifier().setInput(labelMulti, featuresMulti) + + val expectedResult = Seq( + Prediction(1.0, Array(0.0, 17.0, 3.0), Array(0.0, 0.85, 0.15)), + Prediction(0.0, Array(19.0, 0.0, 1.0), Array(0.95, 0.0, 0.05)), + Prediction(2.0, Array(0.0, 1.0, 19.0), Array(0.0, 0.05, 0.95)), + Prediction(2.0, Array(1.0, 2.0, 17.0), Array(0.05, 0.1, 0.85)), + Prediction(1.0, Array(0.0, 17.0, 3.0), Array(0.0, 0.85, 0.15)), + Prediction(0.0, Array(16.0, 0.0, 4.0), Array(0.8, 0.0, 0.2)), + Prediction(1.0, Array(1.0, 17.0, 2.0), Array(0.05, 0.85, 0.1)), + Prediction(0.0, Array(17.0, 0.0, 3.0), Array(0.85, 0.0, 0.15)), + Prediction(2.0, Array(2.0, 1.0, 17.0), Array(0.1, 0.05, 0.85)), + Prediction(2.0, Array(1.0, 2.0, 17.0), Array(0.05, 0.1, 0.85)) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(10) + .setImpurity(Impurity.Gini.sparkName) + .setMaxBins(33) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.2) + .setSubsamplingRate(0.9) + .setNumTrees(21) + .setSeed(2L) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 10 + 
estimator.predictor.getMaxBins shouldBe 33 + estimator.predictor.getImpurity shouldBe Impurity.Gini.sparkName + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.2 + estimator.predictor.getSubsamplingRate shouldBe 0.9 + estimator.predictor.getNumTrees shouldBe 21 + estimator.predictor.getSeed shouldBe 2L + } + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestTest.scala deleted file mode 100644 index 7c38c7474f..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/impl/classification/OpRandomForestTest.scala +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
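In `OpRandomForestClassifierTest` above every raw prediction row sums to 20: Spark's `RandomForestClassifier` defaults to 20 trees, each contributing one (probability-weighted) vote, and the probability vector is just those votes normalized. A one-liner check against the first expected row:

```scala
// Normalize raw vote counts into the expected probability row.
def votesToProb(votes: Array[Double]): Array[Double] = votes.map(_ / votes.sum)

votesToProb(Array(0.0, 17.0, 3.0)) // Array(0.0, 0.85, 0.15), the first expected row above
```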
- */ - -package com.salesforce.op.stages.impl.classification - -import com.salesforce.op._ -import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.classification.Impurity.Gini -import com.salesforce.op.stages.sparkwrappers.generic._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import org.apache.spark.ml.classification.RandomForestClassificationModel -import org.apache.spark.ml.linalg.Vectors -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} - -@RunWith(classOf[JUnitRunner]) -class OpRandomForestTest extends FlatSpec with TestSparkContext { - - val stageNames = Array[String]("RandomForestClassifier_predictionCol", "RandomForestClassifier_rawPredictionCol", - "RandomForestClassifier_probabilityCol" - ) - - lazy val (testData, rawLabel, features) = TestFeatureBuilder[RealNN, OPVector]("label", "features", - Seq( - (1.0.toRealNN, Vectors.dense(12.0, 4.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.0, 0.3, 0.1).toOPVector), - (0.0.toRealNN, Vectors.dense(1.0, 3.9, 4.3).toOPVector), - (1.0.toRealNN, Vectors.dense(10.0, 1.3, 0.9).toOPVector), - (1.0.toRealNN, Vectors.dense(15.0, 4.7, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.5, 0.9, 10.1).toOPVector), - (1.0.toRealNN, Vectors.dense(11.5, 2.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.1, 3.3, 0.1).toOPVector) - ) - ) - - val label = rawLabel.copy(isResponse = true) - - lazy val (multiClassTestData, rawLabelMulti, featuresMulti) = - TestFeatureBuilder[RealNN, OPVector]("labelMulti", "featuresMulti", - Seq( - (1.0.toRealNN, Vectors.dense(12.0, 4.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.0, 0.3, 0.1).toOPVector), - (2.0.toRealNN, Vectors.dense(1.0, 3.9, 4.3).toOPVector), - (2.0.toRealNN, Vectors.dense(10.0, 1.3, 0.9).toOPVector), - (1.0.toRealNN, Vectors.dense(15.0, 4.7, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.5, 0.9, 10.1).toOPVector), - (1.0.toRealNN, Vectors.dense(11.5, 2.3, 1.3).toOPVector), - (0.0.toRealNN, Vectors.dense(0.1, 3.3, 0.1).toOPVector), - (2.0.toRealNN, Vectors.dense(1.0, 4.0, 4.5).toOPVector), - (2.0.toRealNN, Vectors.dense(10.0, 1.5, 1.0).toOPVector) - ) - ) - - val labelMulti = rawLabelMulti.copy(isResponse = true) - - val randomForest = new OpRandomForest().setInput(label, features) - val outputs = randomForest.getOutput() - val (predName, rawName, probName) = (outputs._1.name, outputs._2.name, outputs._3.name) - - val randomForestMulti = new OpRandomForest().setInput(labelMulti, featuresMulti) - val outputsMulti = randomForestMulti.getOutput() - val (predNameMulti, rawNameMulti, probNameMulti) = (outputsMulti._1.name, outputsMulti._2.name, outputsMulti._3.name) - - Spec[OpRandomForest] should "allow the user to set the desired spark parameters" in { - randomForest.setThresholds(Array(1.0, 1.0)) - .setMaxDepth(10) - .setImpurity(Impurity.Gini) - .setMaxBins(33) - .setMinInstancesPerNode(2) - .setMinInfoGain(0.2) - .setSubsamplingRate(0.9) - .setNumTrees(21) - .setSeed(2L) - - randomForest.getSparkParams("thresholds").get.asInstanceOf[Array[Double]] should - contain theSameElementsAs Array(1.0, 1.0) - randomForest.getSparkParams("maxDepth").get.asInstanceOf[Int] shouldBe 10 - randomForest.getSparkParams("maxBins").get.asInstanceOf[Int] shouldBe 33 - randomForest.getSparkParams("impurity").get.asInstanceOf[String] shouldBe Impurity.Gini.sparkName - randomForest.getSparkParams("minInstancesPerNode").get.asInstanceOf[Int] shouldBe 2 - 
randomForest.getSparkParams("minInfoGain").get.asInstanceOf[Double] shouldBe 0.2 - randomForest.getSparkParams("subsamplingRate").get.asInstanceOf[Double] shouldBe 0.9 - randomForest.getSparkParams("numTrees").get.asInstanceOf[Int] shouldBe 21 - randomForest.getSparkParams("seed").get.asInstanceOf[Long] shouldBe 2L - } - - it should "return a properly formed Random Forest when fitted" in { - the[IllegalArgumentException] thrownBy { - randomForest.setInput(label.copy(isResponse = true), features.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." - - val model = randomForest.fit(testData) - - model shouldBe a[SwThreeStageBinaryModel[_, _, _, _, _, _]] - model.stage1 shouldBe a[SwBinaryModel[_, _, _, _]] - - val sparkStage = model.stage1.getSparkMlStage() - assert(sparkStage.get.isInstanceOf[RandomForestClassificationModel]) - assert(model.stage2.getSparkMlStage().isEmpty) - assert(model.stage3.getSparkMlStage().isEmpty) - - model.stage1OperationName shouldBe stageNames(0) - model.stage2OperationName shouldBe stageNames(1) - model.stage3OperationName shouldBe stageNames(2) - - val inputNames = model.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(label.name, features.name) - - val transformedData = model.transform(testData) - - val fields = transformedData.select(rawName, probName, predName).schema.fields - - fields.map(_.name).toList shouldBe List(rawName, probName, predName) - - fields.map(_.dataType.typeName).toList shouldBe List("vector", "vector", "double") - } - - it should "be implemented using shortcuts" in { - val (raw, prob, pred) = features.randomForest(label = label, impurity = Gini) - raw.name shouldBe raw.originStage.getOutputFeatureName - prob.name shouldBe prob.originStage.getOutputFeatureName - pred.name shouldBe pred.originStage.getOutputFeatureName - } - - it should "return a model for multiClassification problem" in { - the[IllegalArgumentException] thrownBy { - randomForestMulti.setInput(labelMulti.copy(isResponse = true), featuresMulti.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." 
- - val modelMulti = randomForestMulti.fit(multiClassTestData) - val transformedDataMulti = modelMulti.transform(multiClassTestData) - val fieldsMulti = transformedDataMulti.select(rawNameMulti, - probNameMulti, predNameMulti).schema.fields - fieldsMulti.map(_.name).toList shouldBe List(rawNameMulti, probNameMulti, predNameMulti) - fieldsMulti.map(_.dataType.typeName).toList shouldBe List("vector", "vector", "double") - } -} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala index f02198bc53..3f15feea46 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AliasTransformerTest.scala @@ -31,31 +31,42 @@ package com.salesforce.op.stages.impl.feature -import com.salesforce.op.stages.FeatureGeneratorStage +import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.binary.BinaryLambdaTransformer -import com.salesforce.op.test.PassengerSparkFixtureTest -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.tuples.RichTuple._ import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class AliasTransformerTest extends FlatSpec with PassengerSparkFixtureTest { +class AliasTransformerTest extends OpTransformerSpec[RealNN, AliasTransformer[RealNN]] { + val sample = Seq((RealNN(1.0), RealNN(2.0)), (RealNN(4.0), RealNN(4.0))) + val (inputData, f1, f2) = TestFeatureBuilder(sample) + val transformer = new AliasTransformer(name = "feature").setInput(f1) + val expectedResult: Seq[RealNN] = sample.map(_._1) - Spec[AliasTransformer[_]] should "allow aliasing features" in { - val myFeature = (weight / height).alias - myFeature.name shouldBe "myFeature" - val all = myFeature.originStage.asInstanceOf[BinaryLambdaTransformer[_, _, _]] - - val transformed = all.transform(passengersDataSet) - transformed.columns.contains(myFeature.name) shouldBe true + it should "have a shortcut that changes feature name on a raw feature" in { + val feature = f1.alias + feature.name shouldBe "feature" + feature.originStage shouldBe a[AliasTransformer[_]] + val origin = feature.originStage.asInstanceOf[AliasTransformer[RealNN]] + val transformed = origin.transform(inputData) + transformed.collect(feature) shouldEqual expectedResult } - - it should "copy successfully" in { - val myFeature = ((weight * 2) / height).alias - val copy = myFeature.originStage.copy(new ParamMap()) - copy.uid shouldBe myFeature.originStage.uid + it should "have a shortcut that changes feature name on a derived feature" in { + val feature = (f1 / f2).alias + feature.name shouldBe "feature" + feature.originStage shouldBe a[BinaryLambdaTransformer[_, _, _]] + val origin = feature.originStage.asInstanceOf[BinaryLambdaTransformer[_, _, _]] + val transformed = origin.transform(inputData) + transformed.columns should contain (feature.name) + transformed.collect(feature) shouldEqual sample.map { case (v1, v2) => (v1.v -> v2.v).map(_ / _).toRealNN(0.0) } + } + it should "have a shortcut that changes feature name on a derived wrapped feature" in { + val feature = f1.toIsotonicCalibrated(label = f2).alias + feature.name shouldBe "feature" + feature.originStage shouldBe a[AliasTransformer[_]] } - 
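+  // Note (per the assertions above): aliasing a raw or wrapped feature inserts an explicit AliasTransformer stage, while aliasing a derived feature simply renames the output of its existing origin stage (the BinaryLambdaTransformer here) without adding a new stage to the workflow.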
} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala index 85e5daa8d4..814a416eb2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala @@ -49,7 +49,8 @@ class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64Tes val result = new OpWorkflow().setResultFeatures(vec).transform(randomData) result.collect(vec) should contain theSameElementsInOrderAs - OPVector(Vectors.dense(0.0, 0.0)) +: Array.fill(expectedRandom.length - 1)(OPVector(Vectors.dense(1.0, 0.0))) + OPVector(Vectors.dense(0.0, 0.0)) +: + Array.fill(expectedRandom.length - 1)(OPVector(Vectors.dense(1.0, 0.0))) } it should "vectorize some real binary content" in { val vec = realBase64.vectorize(topK = 10, minSupport = 0, cleanText = true) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala index 4fc0e6a91b..8ff482ad48 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala @@ -32,77 +32,65 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.TestOpVectorColumnType.{IndCol, IndColWithGroup, IndVal} -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import com.salesforce.op.stages.base.sequence.SequenceModel +import com.salesforce.op.test.TestOpVectorColumnType.IndColWithGroup +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} + @RunWith(classOf[JUnitRunner]) -class BinaryMapVectorizerTest extends FlatSpec with TestSparkContext { +class BinaryMapVectorizerTest + extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]] { - lazy val (data, m1, m2) = TestFeatureBuilder("m1", "m2", + val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( (Map("a" -> false, "b" -> true), Map("z" -> false)), (Map("c" -> false), Map("y" -> true, "x" -> true)), (Map.empty[String, Boolean], Map.empty[String, Boolean]) ).map(v => v._1.toBinaryMap -> v._2.toBinaryMap) ) - val vectorizer = new BinaryMapVectorizer().setInput(m1, m2).setCleanKeys(true) - /** - * Note that defaults and filters are tested in [[RealMapVectorizerTest]] - * as that code is shared between the two classes - */ - Spec[BinaryMapVectorizer[_]] should "take an array of features as input and return a single vector feature" in { - val vector = vectorizer.getOutput() + val estimator = new BinaryMapVectorizer().setTrackNulls(false).setCleanKeys(true).setInput(m1, m2) - vector.name shouldBe vectorizer.getOutputFeatureName - vector.parents should contain theSameElementsAs Array(m1, m2) - vector.originStage shouldBe vectorizer - vector.typeName shouldBe 
FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } + val expectedResult: Seq[OPVector] = Seq( + Vectors.sparse(6, Array(1), Array(1.0)), + Vectors.sparse(6, Array(4, 5), Array(1.0, 1.0)), + Vectors.sparse(6, Array(), Array()) + ).map(_.toOPVector) - it should "return a model that correctly transforms the data" in { - val transformed = vectorizer.setTrackNulls(false).fit(data).transform(data) - val vector = vectorizer.getOutput() - val expected = Array( - Vectors.sparse(6, Array(1), Array(1.0)), - Vectors.sparse(6, Array(4, 5), Array(1.0, 1.0)), - Vectors.sparse(6, Array(), Array()) - ).map(_.toOPVector) + it should "return a model that correctly transforms the data and produces metadata" in { + val transformed = model.transform(inputData) + val vector = estimator.getOutput() val expectedMeta = TestOpVectorMetadataBuilder( - vectorizer, - m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), - IndColWithGroup(None, "C")), + estimator, + m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "X")) ) - transformed.collect(vector) shouldBe expected - val field = transformed.schema(vectorizer.getOutputFeatureName) + transformed.collect(vector) shouldBe expectedResult + val field = transformed.schema(estimator.getOutputFeatureName) OpVectorMetadata(field) shouldEqual expectedMeta - val vectorMetadata = vectorizer.getMetadata() + val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta } - it should " track nulls" in { - val transformed = vectorizer.setTrackNulls(true).fit(data).transform(data) - val vector = vectorizer.getOutput() + it should "return a model that correctly transforms the data and produces metadata with null tracking" in { + val transformed = estimator.setTrackNulls(true).fit(inputData).transform(inputData) + val vector = estimator.getOutput() val expected = Array( Vectors.sparse(12, Array(2, 5, 9, 11), Array(1.0, 1.0, 1.0, 1.0)), Vectors.sparse(12, Array(1, 3, 7, 8, 10), Array(1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - val nullIndicatorValue = Some(OpVectorColumnMetadata.NullString) val expectedMeta = TestOpVectorMetadataBuilder( - vectorizer, + estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), @@ -112,10 +100,9 @@ class BinaryMapVectorizerTest extends FlatSpec with TestSparkContext { ) transformed.collect(vector) shouldBe expected - val field = transformed.schema(vectorizer.getOutputFeatureName) + val field = transformed.schema(estimator.getOutputFeatureName) OpVectorMetadata(field) shouldEqual expectedMeta - val vectorMetadata = vectorizer.getMetadata() + val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta } - } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala index cc24908ddd..f7277ee642 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala @@ 
-33,19 +33,18 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ import com.salesforce.op.test.TestOpVectorColumnType.{IndCol, RootCol} -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class BinaryVectorizerTest extends FlatSpec with TestSparkContext { +class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] { - val (ds, f1, f2) = TestFeatureBuilder( + val (inputData, f1, f2) = TestFeatureBuilder( Seq[(Binary, Binary)]( (Binary(false), Binary(false)), (Binary(false), Binary(true)), @@ -59,17 +58,23 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { ) ) - Spec[BinaryVectorizer] should "take an array of features as input and return a single vector feature" in { - val vectorizer = new BinaryVectorizer().setInput(f1, f2) - val vector = vectorizer.getOutput() - vector.name shouldBe vectorizer.getOutputFeatureName - vector.typeName shouldBe FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } + val transformer = new BinaryVectorizer().setInput(f1, f2) // default settings: trackNulls = true, fillValue = false + + val expectedResult = Seq( + Array(0.0, 0.0, 0.0, 0.0), + Array(0.0, 0.0, 1.0, 0.0), + Array(1.0, 0.0, 0.0, 0.0), + Array(1.0, 0.0, 1.0, 0.0), + Array(0.0, 1.0, 0.0, 0.0), + Array(0.0, 1.0, 1.0, 0.0), + Array(0.0, 0.0, 0.0, 1.0), + Array(1.0, 0.0, 0.0, 1.0), + Array(0.0, 1.0, 0.0, 1.0) + ).map(Vectors.dense(_).toOPVector) it should "transform the data correctly [trackNulls=true,fillValue=false]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(true).setFillValue(false) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Array(0.0, 0.0, 0.0, 0.0), @@ -93,7 +98,7 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly [trackNulls=true,fillValue=true]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(true).setFillValue(true) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Array(0.0, 0.0, 0.0, 0.0), @@ -117,7 +122,7 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly [trackNulls=false,fillValue=false]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(false).setFillValue(false) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Array(0.0, 0.0), @@ -141,7 +146,7 @@ class BinaryVectorizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly [trackNulls=false,fillValue=true]" in { val vectorizer = new BinaryVectorizer().setInput(f1, f2).setTrackNulls(false).setFillValue(true) - val transformed = vectorizer.transform(ds) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val
expected = Array( Array(0.0, 0.0), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala index 2706d20ba1..58436bba1e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala @@ -31,23 +31,21 @@ package com.salesforce.op.stages.impl.feature -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.feature.DateListPivot._ import com.salesforce.op.test.TestOpVectorColumnType.IndCol -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.date.DateTimeUtils import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.linalg.Vectors import org.joda.time.{DateTime, DateTimeConstants} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateListVectorizerTest extends FlatSpec with TestSparkContext { +class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] { // Sunday July 12th 1998 at 22:45 val defaultDate = new DateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis @@ -94,23 +92,15 @@ class DateListVectorizerTest extends FlatSpec with TestSparkContext { val testVectorizer = new DateListVectorizer[DateList]() val outputName = "vecDateList" - Spec[DateListVectorizer[_]] should "have output name set correctly" in { - testVectorizer.operationName shouldBe outputName - } + // OpTransformer base tests + val inputData = testDataCurrent - it should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testVectorizer.getOutput()) - } + val transformer = new DateListVectorizer[DateList]().setInput(clicks, opens, purchases) - it should "return a single output feature of the correct type" in { - val output = testVectorizer.setInput(clicks, opens, purchases).getOutput() - output shouldBe new Feature[OPVector]( - name = testVectorizer.getOutputFeatureName, - originStage = testVectorizer, - isResponse = false, - parents = Array(clicks, opens, purchases) - ) - } + val expectedResult = Seq(Vectors.dense(0.0, 1.0, 0.0, 1.0, 0.0, 1.0).toOPVector, + Vectors.dense(3.0, 0.0, 0.0, 0.0, 0.0, 0.0).toOPVector, + Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0, 0.0).toOPVector, + Vectors.dense(2.0, 0.0, 1.0, 0.0, -1.0, 0.0).toOPVector) it should "vectorize with SinceFirst" in { val testModelTimeSinceFirst = testVectorizer.setInput(clicks, opens, purchases).setPivot(SinceFirst) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala index 486c32523b..d88eb87bba 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala @@ -34,19 +34,17 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import 
com.salesforce.op.stages.impl.feature.TimePeriod._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ - import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.Transformer import org.joda.time.{DateTime => JDateTime} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { +class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateToUnitCircleTransformer[Date]] { val eps = 1E-4 val sampleDateTimes = Seq[JDateTime]( @@ -56,13 +54,12 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { new JDateTime(2017, 4, 17, 18, 0, 0, 0), new JDateTime(1918, 2, 13, 3, 0, 0, 0) ) - val expectedHourOfDayOutput = Array( - Array(1.0, 0.0), - Array(0.0, 1.0), - Array(-1.0, 0.0), - Array(0.0, -1.0), - Array(math.sqrt(2.0) / 2, math.sqrt(2.0) / 2) - ).map(Vectors.dense(_).toOPVector) + + val (inputData, f1) = TestFeatureBuilder(sampleDateTimes.map(x => Date(x.getMillis))) + + val transformer = new DateToUnitCircleTransformer().setInput(f1) + + val expectedResult: Seq[OPVector] = transformData(sampleDateTimes, HourOfDay) def transformData[T <: TimePeriod](data: Seq[JDateTime], timePeriod: T): Array[OPVector] = { val dataTimeStamps: Seq[Date] = data.map(x => Date(x.getMillis())) @@ -78,24 +75,13 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { .map(Vectors.dense(_).toOPVector) } - Spec[DateToUnitCircleTransformer[_]] should - "take an array of features as input and return a single vector feature" in { - val dataTimeStamps: Seq[Date] = sampleDateTimes.map(x => Date(x.getMillis())) - val (ds, f) = TestFeatureBuilder(dataTimeStamps) - val vectorizer = new DateToUnitCircleTransformer().setInput(f) - val vector = vectorizer.getOutput() - vector.name shouldBe vectorizer.getOutputFeatureName - vector.typeName shouldBe FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } - it should "work with its shortcut" in { val dataTimeStamps: Seq[Date] = sampleDateTimes.map(x => Date(x.getMillis())) val (ds, dateFeature) = TestFeatureBuilder(dataTimeStamps) val output = dateFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedHourOfDayOutput).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "work with its DateTime shortcut" in { @@ -104,7 +90,7 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { val output = dateTimeFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedHourOfDayOutput).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "store the proper meta data" in { @@ -137,7 +123,7 @@ class DateToUnitCircleTransformerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly when the timePeriod is 
HourOfDay" in { val actual = transformData(sampleDateTimes, HourOfDay) - all (actual.zip(expectedHourOfDayOutput).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfYear" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala index f0807b0d16..c2d59429a6 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala @@ -249,3 +249,4 @@ object DecisionTreeNumericBucketizerTestHelper extends Matchers { } } + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala index d3c2f3f4f7..5784a95021 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala @@ -35,7 +35,9 @@ import com.salesforce.op.OpWorkflow import com.salesforce.op.features.types._ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.testkit.{RandomBinary, RandomReal} +import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.spark.RichMetadata._ import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec @@ -81,6 +83,17 @@ class DecisionTreeNumericMapBucketizerTest extends FlatSpec with TestSparkContex val expectedSplits = Array(Double.NegativeInfinity, 15, 26, 91, Double.PositiveInfinity) } + lazy val (data, target, currencyMap, realMap) = TestFeatureBuilder("target", "currencyMap", "realMap2", + Seq[(RealNN, CurrencyMap, RealMap)]( + (1.0.toRealNN, CurrencyMap(Map("c0" -> 10)), RealMap.empty), + (1.0.toRealNN, CurrencyMap(Map("c0" -> 10)), RealMap.empty), + (1.0.toRealNN, CurrencyMap(Map("c0" -> 8)), RealMap.empty), + (0.0.toRealNN, CurrencyMap(Map("c0" -> 5)), RealMap.empty), + (0.0.toRealNN, CurrencyMap(Map("c0" -> 3)), RealMap.empty), + (0.0.toRealNN, CurrencyMap(Map("c0" -> 0)), RealMap.empty) + ) + ) + Spec[DecisionTreeNumericMapBucketizer[_, _]] should "produce output that is never a response, " + "except the case where both inputs are" in new NormalData { Seq( @@ -143,6 +156,24 @@ class DecisionTreeNumericMapBucketizerTest extends FlatSpec with TestSparkContex ) } + it should "drop empty numeric map" in { + val targetResponse = target.copy(isResponse = true) + val currencyMapBkts = currencyMap.autoBucketize(label = targetResponse, trackNulls = false, minInfoGain = 0.1) + val realMapBkts = realMap.autoBucketize(label = targetResponse, trackNulls = false, minInfoGain = 0.1) + val featureVector = Seq(currencyMapBkts, realMapBkts).transmogrify(Some(targetResponse)) + + val transformed = new OpWorkflow().setResultFeatures(currencyMapBkts, realMapBkts, featureVector).transform(data) + + // featureVector should consist of bucketized features from currencyMap and no feature from realMap + val featureVectorMeta = 
OpVectorMetadata(transformed.schema(featureVector.name)) + featureVectorMeta.columns.length shouldBe 2 + featureVectorMeta.columns.foreach{ col => + col.parentFeatureName should contain theSameElementsAs Seq("currencyMap") + col.parentFeatureType should contain theSameElementsAs Seq("com.salesforce.op.features.types.CurrencyMap") + col.indicatorGroup shouldBe Some("c0") + } + } + private def assertBucketizer ( bucketizer: DecisionTreeNumericMapBucketizer[_, _ <: OPMap[_]], diff --git a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterIntegralMapTest.scala similarity index 52% rename from core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala rename to core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterIntegralMapTest.scala index d9f42ca61c..83ff87113f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterIntegralMapTest.scala @@ -29,42 +29,59 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package com.salesforce.op.stages.base.sequence +package com.salesforce.op.stages.impl.feature -import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.spark.RichRow._ -import org.apache.spark.ml.param.ParamMap import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} + @RunWith(classOf[JUnitRunner]) -class SequenceTransformerTest extends FlatSpec with PassengerSparkFixtureTest { +class FilterIntegralMapTest extends OpTransformerSpec[IntegralMap, FilterMap[IntegralMap]] { + + val (inputData, f1Int) = TestFeatureBuilder[IntegralMap]( + Seq( + IntegralMap(Map("Arthur" -> 1, "Lancelot" -> 2, "Galahad" -> 3)), + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), + IntegralMap(Map("Knight" -> 5)) + ) + ) + val transformer = new FilterMap[IntegralMap]().setInput(f1Int) - val toMP = new SequenceLambdaTransformer[Real, MultiPickList](operationName = "MP", - transformFn = value => MultiPickList(value.map(_.v.getOrElse(0.0).toString).toSet) + val expectedResult: Seq[IntegralMap] = Seq( + IntegralMap(Map("Arthur" -> 1, "Lancelot" -> 2, "Galahad" -> 3)), + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), + IntegralMap(Map("Knight" -> 5)) ) - Spec[SequenceLambdaTransformer[_, _]] should "work when returning a MultiPickList feature" in { - toMP.setInput(age, weight) - val transformedData = toMP.transform(passengersDataSet) - val columns = transformedData.columns - assert(columns.contains(toMP.getOutputFeatureName)) - val output = toMP.getOutput() - val answer = passengersArray.map(r => - toMP.transformFn(Seq(r.getFeatureType[Real](age), r.getFeatureType[Real](weight))) + it should "filter by whitelisted keys" in { + transformer.setWhiteListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + IntegralMap(Map("Arthur" -> 1)), + IntegralMap.empty, + IntegralMap(Map("Knight" -> 5)) ) - transformedData.collect(output) shouldBe answer + + filtered should contain theSameElementsAs dataExpected } - it should "copy successfully" in { - val tr = new SequenceLambdaTransformer[Text, Text]( - 
operationName = "foo", - transformFn = x => x.head + it should "filter by blacklisted keys" in { + transformer.setInput(f1Int) + .setWhiteListKeys(Array[String]()) + .setBlackListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3)), + IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), + IntegralMap.empty ) - tr.copy(new ParamMap()).uid shouldBe tr.uid + + filtered should contain theSameElementsAs dataExpected } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMapTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMapTest.scala deleted file mode 100644 index 06fcf23bbc..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMapTest.scala +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -package com.salesforce.op.stages.impl.feature - -import com.salesforce.op._ -import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} - - -@RunWith(classOf[JUnitRunner]) -class FilterMapTest extends FlatSpec with TestSparkContext { - - val (ds, f1) = TestFeatureBuilder[TextMap]( - Seq( - TextMap(Map("Arthur" -> "King", "Lancelot" -> "Brave", "Galahad" -> "Pure")), - TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), - TextMap(Map("Knight" -> "Ni")) - ) - ) - - val filter = new FilterMap[TextMap]().setInput(f1) - - - val (dsInt, f1Int) = TestFeatureBuilder[IntegralMap]( - Seq( - IntegralMap(Map("Arthur" -> 1, "Lancelot" -> 2, "Galahad" -> 3)), - IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), - IntegralMap(Map("Knight" -> 5)) - ) - ) - val filterInt = new FilterMap[IntegralMap]().setInput(f1Int) - - - val (dsCat, f1Cat) = TestFeatureBuilder[MultiPickListMap]( - Seq( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"), - "Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"))), - MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"), - "Bedevere" -> Set("Wise", "Knight"))), - MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) - ) - ) - val filterCat = new FilterMap[MultiPickListMap]().setInput(f1Cat) - - - classOf[FilterMap[_]].getSimpleName should "return single properly formed feature" in { - val filtered = filter.getOutput() - - filtered.name shouldBe filter.getOutputFeatureName - filtered.originStage shouldBe filter - filtered.parents shouldBe Array(f1) - } - - it should "filter TextMap by whitelisted keys" in { - filter.setWhiteListKeys(Array("Arthur", "Knight")) - - val filtered = filter.transform(ds).collect(filter.getOutput) - val dataExpected = Array( - TextMap(Map("Arthur" -> "King")), - TextMap.empty, - TextMap(Map("Knight" -> "Ni")) - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter TextMap by blacklisted keys" in { - filter.setInput(f1) - .setWhiteListKeys(Array[String]()) - .setBlackListKeys(Array("Arthur", "Knight")) - val filtered = filter.transform(ds).collect(filter.getOutput) - - val dataExpected = Array( - TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure")), - TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), - TextMap.empty - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter IntegralMap by whitelisted keys" in { - filterInt.setWhiteListKeys(Array("Arthur", "Knight")) - val filtered = filterInt.transform(dsInt).collect(filterInt.getOutput()) - - val dataExpected = Array( - IntegralMap(Map("Arthur" -> 1)), - IntegralMap.empty, - IntegralMap(Map("Knight" -> 5)) - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter IntegralMap by blacklisted keys" in { - filterInt.setInput(f1Int) - .setWhiteListKeys(Array[String]()) - .setBlackListKeys(Array("Arthur", "Knight")) - val filtered = filterInt.transform(dsInt).collect(filterInt.getOutput()) - - val dataExpected = Array( - IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3)), - IntegralMap(Map("Lancelot" -> 2, "Galahad" -> 3, "Bedevere" -> 4)), - IntegralMap.empty - ) - - filtered should contain 
theSameElementsAs dataExpected - } - - it should "filter MultiPickListMap by whitelisted keys" in { - filterCat.setWhiteListKeys(Array("Arthur", "Knight")) - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), - MultiPickListMap.empty, - MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter MultiPickListMap by blacklisted keys" in { - filterCat - .setWhiteListKeys(Array[String]()) - .setBlackListKeys(Array("Arthur", "Knight")) - - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"))), - MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), - "Galahad" -> Set("Pure", "Knight"), - "Bedevere" -> Set("Wise", "Knight"))), - MultiPickListMap.empty - ) - - filtered should contain theSameElementsAs dataExpected - } - - it should "filter correctly when using shortcut" in { - val filtered = f1.filter(whiteList = Seq("Arthur", "Knight"), blackList = Seq()) - - filtered.name shouldBe filtered.originStage.getOutputFeatureName - filtered.originStage shouldBe a[FilterMap[_]] - filtered.parents shouldBe Array(f1) - } - - it should "set cleanMapFlag correctly" in { - filter.setCleanText(false) - filter.get[Boolean](filter.cleanText).get shouldBe false - filter.setCleanKeys(false) - filter.get[Boolean](filter.cleanKeys).get shouldBe false - } - - it should "not clean map when flag set to false" in { - filterCat - .setCleanText(false) - .setCleanKeys(false) - .setWhiteListKeys(Array("Arthur", "Knight")) - .setBlackListKeys(Array()) - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), - MultiPickListMap.empty, - MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) - ) - filtered should contain theSameElementsAs dataExpected - } - - it should "clean map when flag set to true" in { - filterCat - .setCleanKeys(true) - .setCleanText(true) - .setWhiteListKeys(Array("Arthur", "Knight")) - .setBlackListKeys(Array()) - val filtered = filterCat.transform(dsCat).collect(filterCat.getOutput()) - - val dataExpected = Array( - MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), - MultiPickListMap.empty, - MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) - ) - filtered should contain theSameElementsAs dataExpected - } - -} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMultiPickListMapTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMultiPickListMapTest.scala new file mode 100644 index 0000000000..b4a98f5e92 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterMultiPickListMapTest.scala @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class FilterMultiPickListMapTest extends OpTransformerSpec[MultiPickListMap, FilterMap[MultiPickListMap]] { + val (inputData, f1Cat) = TestFeatureBuilder[MultiPickListMap]( + Seq( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"), + "Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"))), + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"), + "Bedevere" -> Set("Wise", "Knight"))), + MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) + ) + ) + val transformer = new FilterMap[MultiPickListMap]().setInput(f1Cat) + + val expectedResult = Seq( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"), + "Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"))), + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"), + "Bedevere" -> Set("Wise", "Knight"))), + MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) + ) + + it should "filter whitelisted keys" in { + transformer.setWhiteListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), + MultiPickListMap.empty, + MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) + ) + + filtered should contain theSameElementsAs dataExpected + } + + it should "filter blacklisted keys" in { + transformer + .setWhiteListKeys(Array[String]()) + .setBlackListKeys(Array("Arthur", "Knight")) + + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"))), + MultiPickListMap(Map("Lancelot" -> Set("Brave", "Knight"), + "Galahad" -> Set("Pure", "Knight"), + "Bedevere" -> Set("Wise", "Knight"))), + MultiPickListMap.empty + ) + + filtered 
should contain theSameElementsAs dataExpected + } + + it should "not clean map when flag set to false" in { + transformer + .setCleanText(false) + .setCleanKeys(false) + .setWhiteListKeys(Array("Arthur", "Knight")) + .setBlackListKeys(Array()) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), + MultiPickListMap.empty, + MultiPickListMap(Map("Knight" -> Set("Ni", "Ekke Ekke Ekke Ekke Ptang Zoo Boing"))) + ) + filtered should contain theSameElementsAs dataExpected + } + + it should "clean map when flag set to true" in { + transformer + .setCleanKeys(true) + .setCleanText(true) + .setWhiteListKeys(Array("Arthur", "Knight")) + .setBlackListKeys(Array()) + val filtered = transformer.transform(inputData).collect(transformer.getOutput()) + + val dataExpected = Array( + MultiPickListMap(Map("Arthur" -> Set("King", "Briton"))), + MultiPickListMap.empty, + MultiPickListMap(Map("Knight" -> Set("Ni", "EkkeEkkeEkkeEkkePtangZooBoing"))) + ) + filtered should contain theSameElementsAs dataExpected + } + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterTextMapTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterTextMapTest.scala new file mode 100644 index 0000000000..3cdae3044f --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/FilterTextMapTest.scala @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op._ +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class FilterTextMapTest extends OpTransformerSpec[TextMap, FilterMap[TextMap]] { + val (inputData, f1) = TestFeatureBuilder[TextMap]( + Seq( + TextMap(Map("Arthur" -> "King", "Lancelot" -> "Brave", "Galahad" -> "Pure")), + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), + TextMap(Map("Knight" -> "Ni")) + ) + ) + + val transformer = new FilterMap[TextMap]().setInput(f1) + + val expectedResult: Seq[TextMap] = Array( + TextMap(Map("Arthur" -> "King", "Lancelot" -> "Brave", "Galahad" -> "Pure")), + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), + TextMap(Map("Knight" -> "Ni")) + ) + + it should "filter whitelisted keys" in { + transformer.setWhiteListKeys(Array("Arthur", "Knight")) + + val filtered = transformer.transform(inputData).collect(transformer.getOutput) + val dataExpected = Array( + TextMap(Map("Arthur" -> "King")), + TextMap.empty, + TextMap(Map("Knight" -> "Ni")) + ) + + filtered should contain theSameElementsAs dataExpected + } + + it should "filter blacklisted keys" in { + transformer.setInput(f1) + .setWhiteListKeys(Array[String]()) + .setBlackListKeys(Array("Arthur", "Knight")) + val filtered = transformer.transform(inputData).collect(transformer.getOutput) + + val dataExpected = Array( + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure")), + TextMap(Map("Lancelot" -> "Brave", "Galahad" -> "Pure", "Bedevere" -> "Wise")), + TextMap.empty + ) + + filtered should contain theSameElementsAs dataExpected + } + + it should "set cleanMapFlag correctly" in { + transformer.setCleanText(false) + transformer.get[Boolean](transformer.cleanText).get shouldBe false + transformer.setCleanKeys(false) + transformer.get[Boolean](transformer.cleanKeys).get shouldBe false + } + + it should "filter correctly when using shortcut" in { + val filtered = f1.filter(whiteList = Seq("Arthur", "Knight"), blackList = Seq()) + + filtered.name shouldBe filtered.originStage.getOutputFeatureName + filtered.originStage shouldBe a[FilterMap[_]] + filtered.parents shouldBe Array(f1) + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala index dbc7ea26ff..2e301a2a93 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/JaccardSimilarityTest.scala @@ -33,17 +33,15 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class JaccardSimilarityTest extends FlatSpec with TestSparkContext { +class JaccardSimilarityTest extends OpTransformerSpec[RealNN, JaccardSimilarity] { - val (ds, f1, f2) = TestFeatureBuilder( + val (inputData, f1, f2) = TestFeatureBuilder( Seq( (Seq("Red", 
"Green"), Seq("Red")), (Seq("Red", "Green"), Seq("Yellow, Blue")), @@ -51,15 +49,10 @@ class JaccardSimilarityTest extends FlatSpec with TestSparkContext { ).map(v => v._1.toMultiPickList -> v._2.toMultiPickList) ) - val jacSimTrans = new JaccardSimilarity().setInput(f1, f2) + val transformer = new JaccardSimilarity().setInput(f1, f2) - classOf[JaccardSimilarity].getSimpleName should "return single properly formed feature" in { - val jaccard = jacSimTrans.getOutput() + val expectedResult: Seq[RealNN] = Seq(0.5, 0.0, 1.0).toRealNN - jaccard.name shouldBe jacSimTrans.getOutputFeatureName - jaccard.parents shouldBe Array(f1, f2) - jaccard.originStage shouldBe jacSimTrans - } it should "have a shortcut" in { val jaccard = f1.jaccardSimilarity(f2) @@ -70,32 +63,25 @@ class JaccardSimilarityTest extends FlatSpec with TestSparkContext { it should "return 1 when both vectors are empty" in { val set1 = Seq.empty[String].toMultiPickList val set2 = Seq.empty[String].toMultiPickList - jacSimTrans.transformFn(set1, set2) shouldBe 1.0.toRealNN + transformer.transformFn(set1, set2) shouldBe 1.0.toRealNN } it should "return 1 when both vectors are the same" in { val set1 = Seq("Red", "Blue", "Green").toMultiPickList val set2 = Seq("Red", "Blue", "Green").toMultiPickList - jacSimTrans.transformFn(set1, set2) shouldBe 1.0.toRealNN + transformer.transformFn(set1, set2) shouldBe 1.0.toRealNN } it should "calculate similarity correctly when vectors are different" in { val set1 = Seq("Red", "Green", "Blue").toMultiPickList val set2 = Seq("Red", "Blue").toMultiPickList - jacSimTrans.transformFn(set1, set2) shouldBe (2.0 / 3.0).toRealNN + transformer.transformFn(set1, set2) shouldBe (2.0 / 3.0).toRealNN val set3 = Seq("Red").toMultiPickList val set4 = Seq("Blue").toMultiPickList - jacSimTrans.transformFn(set3, set4) shouldBe 0.0.toRealNN + transformer.transformFn(set3, set4) shouldBe 0.0.toRealNN val set5 = Seq("Red", "Yellow", "Green").toMultiPickList val set6 = Seq("Pink", "Green", "Blue").toMultiPickList - jacSimTrans.transformFn(set5, set6) shouldBe (1.0 / 5.0).toRealNN - } - - it should "calculate similarity correctly on a dataset" in { - val transformed = jacSimTrans.transform(ds) - val output = jacSimTrans.getOutput() - val actualOutput = transformed.collect(output) - actualOutput shouldBe Seq(0.5, 0.0, 1.0).toRealNN + transformer.transformFn(set5, set6) shouldBe (1.0 / 5.0).toRealNN } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala index 837174f9c5..f4c3cb360a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/LangDetectorTest.scala @@ -32,20 +32,19 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.text.Language import org.apache.spark.ml.Transformer import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class LangDetectorTest extends FlatSpec with TestSparkContext { +class LangDetectorTest extends OpTransformerSpec[RealMap, LangDetector[Text]] { // scalastyle:off - val (ds, f1, f2, f3) = TestFeatureBuilder( + 
val (inputData, f1, f2, f3) = TestFeatureBuilder( Seq( ( "I've got a lovely bunch of coconuts".toText, @@ -65,37 +64,31 @@ class LangDetectorTest extends FlatSpec with TestSparkContext { ) ) // scalastyle:on - val langDetector = new LangDetector[Text]().setInput(f1) + val transformer = new LangDetector[Text]().setInput(f1) - classOf[LangDetector[_]].getSimpleName should "return single properly formed feature" in { - val output1 = langDetector.getOutput() + private val langMap = f1.detectLanguages() - output1.name shouldBe langDetector.getOutputFeatureName - output1.parents shouldBe Array(f1) - output1.originStage shouldBe langDetector - } + // English result + val expectedResult: Seq[RealMap] = Seq( + Map("en" -> 0.9999984360934321), + Map("en" -> 0.9999900853228016), + Map("en" -> 0.9999900116744931) + ).map(_.toRealMap) it should "return empty RealMap when input text is empty" in { - langDetector.transformFn(Text.empty) shouldBe RealMap.empty - } - - it should "detect English language" in { - assertDetectionResults( - results = langDetector.setInput(f1).transform(ds).collect(langDetector.getOutput()), - expectedLanguage = Language.English - ) + transformer.transformFn(Text.empty) shouldBe RealMap.empty } it should "detect Japanese language" in { assertDetectionResults( - results = langDetector.setInput(f2).transform(ds).collect(langDetector.getOutput()), + results = transformer.setInput(f2).transform(inputData).collect(transformer.getOutput()), expectedLanguage = Language.Japanese ) } it should "detect French language" in { assertDetectionResults( - results = langDetector.setInput(f3).transform(ds).collect(langDetector.getOutput()), + results = transformer.setInput(f3).transform(inputData).collect(transformer.getOutput()), expectedLanguage = Language.French ) } @@ -104,7 +97,7 @@ class LangDetectorTest extends FlatSpec with TestSparkContext { val tokenized = f1.detectLanguages() assertDetectionResults( - results = tokenized.originStage.asInstanceOf[Transformer].transform(ds).collect(tokenized), + results = tokenized.originStage.asInstanceOf[Transformer].transform(inputData).collect(tokenized), expectedLanguage = Language.English ) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala index cb26746d74..a0eb2a7c1c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MimeTypeDetectorTest.scala @@ -36,19 +36,21 @@ import java.io.FileInputStream import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.unary.UnaryTransformer -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestSparkContext} import com.salesforce.op.testkit.RandomText import com.salesforce.op.utils.spark.RichDataset._ import org.apache.commons.io.IOUtils import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class MimeTypeDetectorTest extends FlatSpec with TestSparkContext with Base64TestData { +class MimeTypeDetectorTest extends OpTransformerSpec[Text, MimeTypeDetector] with Base64TestData { + val inputData = randomData + val transformer = new MimeTypeDetector().setInput(randomBase64) + val expectedResult = expectedRandom - Spec[MimeTypeDetector] should "validate the type 
hint" in { + it should "validate the type hint" in { assertThrows[IllegalArgumentException](new MimeTypeDetector().setTypeHint("blarg")) } it should "validate the ma bytes to parse" in { @@ -73,9 +75,9 @@ class MimeTypeDetectorTest extends FlatSpec with TestSparkContext with Base64Tes result.collect(mime) should contain theSameElementsInOrderAs expectedMimeJson } - } + trait Base64TestData { self: TestSparkContext => diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizerTest.scala new file mode 100644 index 0000000000..d891ebe261 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NameEntityRecognizerTest.scala @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op.features.types._ +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import com.salesforce.op.utils.text.Language +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class NameEntityRecognizerTest extends OpTransformerSpec[MultiPickListMap, NameEntityRecognizer[Text]] { + + // Base tests + val (inputData, inputText) = TestFeatureBuilder(Seq( + ("Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is " + + "chairman of Elsevier N.V., the Dutch publishing group. 
Rudolph Agnew, 55 years " + "old and former chairman of Consolidated Gold Fields PLC, was named a director of this " + "British industrial conglomerate.").toText)) + + val transformer = new NameEntityRecognizer[Text].setInput(inputText) + + val expectedResult: Seq[MultiPickListMap] = Seq( + Map("Rudolph" -> Set("Person"), + "Agnew" -> Set("Person"), + "Consolidated" -> Set("Organization"), + "Vinken" -> Set("Person"), + "Gold" -> Set("Organization"), + "PLC" -> Set("Organization"), + "Pierre" -> Set("Person"), + "Fields" -> Set("Organization") + ).toMultiPickListMap) + + it should "find the same set of name entities using the shortcut in RichTextFeatures" in { + val nameEntityRecognizer = inputText.recognizeEntities().originStage.asInstanceOf[NameEntityRecognizer[Text]] + .setInput(inputText) + val transformed = nameEntityRecognizer.transform(inputData) + val output = nameEntityRecognizer.getOutput() + transformed.collect(output) shouldEqual expectedResult + } + + it should "find name entities for Dutch text" in { + // scalastyle:off + val input = ("Pierre Vinken, 61 jaar oud, treedt toe tot het bestuur als een niet-uitvoerende " + + "directeur op Nov. 29. De heer Vinken is voorzitter van Elsevier N.V., de Nederlandse uitgeversgroep. " + + "Rudolph Agnew, 55 jaar oud en voormalig voorzitter van Consolidated Gold Fields PLC, werd benoemd tot " + + "bestuurder van dit Britse industriële conglomeraat.").toText + val expectedOutput = Map( + "Nederlandse" -> Set("Misc"), + "Nov." -> Set("Organization"), + "Consolidated" -> Set("Misc"), + "Vinken" -> Set("Person"), + "Pierre" -> Set("Person"), + "Britse" -> Set("Misc") + ).toMultiPickListMap + new NameEntityRecognizer[Text]().setDefaultLanguage(Language.Dutch).transformFn(input) shouldEqual expectedOutput + // scalastyle:on + } + + it should "return an empty map when there's no pre-trained name entity recognition model for the given language" in { + val input = ("Pierre Vinken, mwenye umri wa miaka 61, atajiunga na bodi hiyo kama mkurugenzi asiyetarajiwa " + + "Novemba 29. Mheshimiwa Vinken ni mwenyekiti wa Elsevier N.V., kundi la kuchapisha Kiholanzi. " + + "Rudolph Agnew, mwenye umri wa miaka 55 na mwenyekiti wa zamani wa Mkutano Mkuu wa Gold Fields, " + + "aliitwa mkurugenzi wa muungano huu wa viwanda wa Uingereza.").toText + val expectedOutput = Map.empty[String, Set[String]].toMultiPickListMap + new NameEntityRecognizer[Text]().setDefaultLanguage(Language.Swahili).transformFn(input) shouldEqual expectedOutput + } + + // TODO: add a test for Spanish NER after finding the Spanish tokenizer +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala new file mode 100644 index 0000000000..541043fc34 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import com.salesforce.op._ +import com.salesforce.op.OpWorkflow +import com.salesforce.op.features.Feature +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer +import com.salesforce.op.test.{FeatureTestBase, TestFeatureBuilder} +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.testkit.{RandomIntegral, RandomReal} +import org.apache.spark.ml.linalg.Vectors +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class NumericVectorizerTest extends FlatSpec with FeatureTestBase { + + val ageData: Seq[Real] = RandomReal.uniform[Real](maxValue = 80.0).limit(100) + val heightData: Seq[Real] = RandomReal.normal[Real](mean = 65.0, sigma = 8).limit(100) + val countData: Seq[Integral] = RandomIntegral.integrals(0, 10).limit(100) + val labelTransformer = new UnaryLambdaTransformer[Real, RealNN](operationName = "labelFunc", + transformFn = { + case SomeValue(Some(x)) if x > 30.0 => 1.toRealNN + case _ => 0.0.toRealNN + } + ) + + Spec[RichRealFeature[_]] should "vectorize a small sample of real values" in { + val inputData = Seq(-4, -3, -2, -1, 1, 2, 3, 4).map(_.toReal) + val labelData = Seq(0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0).map(_.toRealNN) + val generatedData = inputData.zip(labelData) + val (ds, input, label) = TestFeatureBuilder("input", "label", generatedData) + val autoBucketFeature = Seq(input).transmogrify(label = Some(label.copy(isResponse = true))) + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature).transform(ds) + // value col, null indicator col, bucket 0 indicator, bucket 1 indicator + val expected = Array( + Array(-4.0, 0.0, 1.0, 0.0), + Array(-3.0, 0.0, 1.0, 0.0), + Array(-2.0, 0.0, 1.0, 0.0), + Array(-1.0, 0.0, 1.0, 0.0), + Array(1.0, 0.0, 0.0, 1.0), + Array(2.0, 0.0, 0.0, 1.0), + Array(3.0, 0.0, 0.0, 1.0), + Array(4.0, 0.0, 0.0, 1.0) + ).map(Vectors.dense(_).toOPVector) + vectorized.collect(autoBucketFeature) should contain theSameElementsAs expected + } + it should "vectorize single real feature with a label" in { + val (ds, age) = TestFeatureBuilder("age", ageData) + val labelData = age.transformWith(labelTransformer).asInstanceOf[Feature[RealNN]].copy(isResponse = true) + val autoBucketFeature = Seq(age).transmogrify(label = Some(labelData)) + val manualBucketFeature = Seq( + age.vectorize(fillValue = 0, fillWithMean = true, trackNulls = true), + 
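+ // autoBucketize learns its bucket boundaries from the label (decision-tree splits on the
+ // response), so pairing it with the plain vectorize encoding should match what
+ // transmogrify(label = ...) produces for the same feature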
age.autoBucketize(labelData, trackNulls = false) + ).combine() + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + + for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { + autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray + } + } + it should "vectorize multiple real features with a label" in { + val generatedData: Seq[(Real, Real)] = ageData.zip(heightData) + val (ds, age, height) = TestFeatureBuilder("age", "height", generatedData) + val labelData = age.transformWith(labelTransformer).asInstanceOf[Feature[RealNN]].copy(isResponse = true) + val autoBucketFeature = Seq(age, height).transmogrify(label = Some(labelData)) + val manualBucketFeature = Seq( + age, age.autoBucketize(labelData, trackNulls = false), + height, height.autoBucketize(labelData, trackNulls = false) + ).transmogrify() + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + + for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { + autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray + } + } + Spec[RichIntegralFeature[_]] should "vectorize single integral feature with a label" in { + val (ds, count) = TestFeatureBuilder("count", countData) + val labelTransformer = new UnaryLambdaTransformer[Integral, RealNN](operationName = "labelFunc", + transformFn = { + case SomeValue(Some(x)) if x > 5 => 1.0.toRealNN + case _ => 0.0.toRealNN + } + ) + val labelData = labelTransformer.setInput(count).getOutput().asInstanceOf[Feature[RealNN]].copy(isResponse = true) + val autoBucketFeature = Seq(count).transmogrify(label = Some(labelData)) + val manualBucketFeature = Seq(count, count.autoBucketize(labelData, trackNulls = false)).transmogrify() + val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + + for {(autoCount, manualCount) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { + autoCount.v.toArray should contain theSameElementsAs manualCount.v.toArray + } + } +}
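// A minimal sketch (not part of this patch) of the label-aware vectorization pattern these
// tests compare, assuming a tiny dataset built with TestFeatureBuilder as above:
//
//   import com.salesforce.op._
//   import com.salesforce.op.features.types._
//   import com.salesforce.op.test.TestFeatureBuilder
//
//   val raw = Seq(1.0, 2.0, 30.0, 40.0).map(_.toReal).zip(Seq(0.0, 0.0, 1.0, 1.0).map(_.toRealNN))
//   val (ds, input, label) = TestFeatureBuilder("input", "label", raw)
//   val response = label.copy(isResponse = true)
//   val vector = Seq(input).transmogrify(label = Some(response)) // buckets derived from the label
//   val result = new OpWorkflow().setResultFeatures(vector).transform(ds)
diff --git 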
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 78923d31f1..4812f2adae 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -151,14 +151,16 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} } - it should "detect two non categorical text features" in { + it should "use separate hash space for each text feature" in { val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() - .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setMaxCardinality(1).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) .setCleanKeys(false) + .setHashSpaceStrategy(HashSpaceStrategy.Separate) .setInput(m1, m2).getOutput() val smartVectorized = new SmartTextVectorizer() - .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setMaxCardinality(1).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setHashSpaceStrategy(HashSpaceStrategy.Separate) .setInput(f1, f2).getOutput() val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) @@ -172,8 +174,76 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) - if (m.index < 4) m.indicatorGroup shouldBe Option(f1.name) - else m.indicatorGroup shouldBe Option(f2.name) + if (m.index < 4 || m.index == 8) m.indicatorGroup shouldBe Option(f1.name) + else if (m.index < 8 || m.index == 9) m.indicatorGroup shouldBe Option(f2.name) + m.indicatorValue shouldBe f.indicatorValue + } + + result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + } + + it should "use shared hash space for two text features" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setCleanKeys(false) + .setNumFeatures(4).setHashSpaceStrategy(HashSpaceStrategy.Shared) + .setInput(m1, m2).getOutput() + + val smartVectorized = new SmartTextVectorizer() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setNumFeatures(4).setHashSpaceStrategy(HashSpaceStrategy.Shared) + .setInput(f1, f2).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val result = transformed.collect(smartMapVectorized, smartVectorized) + + val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + mapMeta.history.keys shouldBe Set(m1.name, m2.name) + mapMeta.columns.length shouldBe meta.columns.length + + mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + m.parentFeatureName shouldBe Array(m1.name) + m.parentFeatureType shouldBe Array(m1.typeName) + if (m.index == 4) { + assert(m.indicatorGroup === Option(f1.name), s"first null indicator should be from ${f1.name}") + } else if (m.index == 5) { + assert(m.indicatorGroup === Option(f2.name), s"second null indicator should be from ${f2.name}") + } + m.indicatorValue 
shouldBe f.indicatorValue + } + + result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + } + + it should "use shared hash space for two text features again" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setCleanKeys(false) + .setNumFeatures(TransmogrifierDefaults.MaxNumOfFeatures).setHashSpaceStrategy(HashSpaceStrategy.Auto) + .setInput(m1, m2).getOutput() + + val smartVectorized = new SmartTextVectorizer() + .setMaxCardinality(1).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setNumFeatures(TransmogrifierDefaults.MaxNumOfFeatures).setHashSpaceStrategy(HashSpaceStrategy.Auto) + .setInput(f1, f2).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val result = transformed.collect(smartMapVectorized, smartVectorized) + + val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + mapMeta.history.keys shouldBe Set(m1.name, m2.name) + mapMeta.columns.length shouldBe meta.columns.length + + mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + m.parentFeatureName shouldBe Array(m1.name) + m.parentFeatureType shouldBe Array(m1.typeName) + if (m.index == TransmogrifierDefaults.MaxNumOfFeatures) { + assert(m.indicatorGroup === Option(f1.name), s"first null indicator should be from ${f1.name}") + } else if (m.index > TransmogrifierDefaults.MaxNumOfFeatures) { + assert(m.indicatorGroup === Option(f2.name), s"second null indicator should be from ${f2.name}") + } m.indicatorValue shouldBe f.indicatorValue } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala index a91a677428..b8961248d9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala @@ -47,8 +47,10 @@ class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { val (ds, f1) = TestFeatureBuilder( Seq[(TextMap)]( - TextMap(Map("k1" -> "A giraffe drinks by the watering hole", "k2" -> "Cheese")), - TextMap(Map("k2" -> "French Fries")), + TextMap(Map("k1" -> "A giraffe drinks by the watering hole", "k2" -> "Cheese", "k3" -> "Hello", "k4" -> "Bye")), + // scalastyle:off + TextMap(Map("k2" -> "French Fries", "k4" -> "\uA7BC\u10C8\u2829\u29BA\u23E1")), + // scalastyle:on TextMap(Map("k3" -> "Hip-hop Pottamus")) ) ) @@ -68,9 +70,9 @@ class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val expected = Array( - Array(0.0, 0.0, 1.0), - Array(1.0, 0.0, 1.0), - Array(1.0, 1.0, 0.0) + Array(0.0, 0.0, 0.0, 0.0), + Array(1.0, 0.0, 1.0, 1.0), + Array(1.0, 1.0, 0.0, 1.0) ).map(Vectors.dense(_).toOPVector) transformed.collect(vector) shouldBe expected @@ -80,7 +82,8 @@ class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { f1 -> List( IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k1"), IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k2"), - IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k3") + IndColWithGroup(name = Option(TransmogrifierDefaults.NullString), groupName = "k3"), + IndColWithGroup(name = 
Option(TransmogrifierDefaults.NullString), groupName = "k4") ) ) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala index 8bd3fcfa41..62c328c7b3 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala @@ -70,15 +70,15 @@ class TextTokenizerTest extends FlatSpec with TestSparkContext { trait English { val expected = Array( - List("i'v", "got", "love", "bunch", "coconut").toTextList, - List("all", "stand", "row").toTextList, - List("big", "on", "small", "on", "some", "big", "your", "head").toTextList, - List("bodi", "big", "on", "small", "h1", "on", "h1", "some", "big", "your", "head", "bodi").toTextList, + List("got", "love", "bunch", "coconut").toTextList, + List("stand", "row").toTextList, + List("big", "on", "small", "on", "big", "head").toTextList, + List("bodi", "big", "on", "small", "h1", "on", "h1", "big", "head", "bodi").toTextList, TextList.empty ) val expectedHtml = { val copy = expected.toList.toArray - copy(3) = List("big", "on", "small", "on", "some", "big", "your", "head").toTextList + copy(3) = List("big", "on", "small", "on", "big", "head").toTextList copy } } @@ -188,10 +188,10 @@ class TextTokenizerTest extends FlatSpec with TestSparkContext { input = english, tokenizer = tokenized.originStage.asInstanceOf[TextTokenizer[Text]], expected = Array( - List("i've", "got", "lovely", "bunch", "coconuts").toTextList, - List("all", "standing", "row").toTextList, - List("big", "ones", "small", "ones", "some", "big", "your", "head").toTextList, - List("body", "big", "ones", "small", "h1", "ones", "h1", "some", "big", "your", "head", "body").toTextList, + List("got", "lovely", "bunch", "coconuts").toTextList, + List("standing", "row").toTextList, + List("big", "ones", "small", "ones", "big", "head").toTextList, + List("body", "big", "ones", "small", "h1", "ones", "h1", "big", "head", "body").toTextList, TextList.empty ) ) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala index c8c5ed7cc7..f777c9b0f8 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/insights/RecordInsightsLOCOTest.scala @@ -32,19 +32,23 @@ package com.salesforce.op.stages.impl.insights import com.salesforce.op.FeatureHistory -import com.salesforce.op.stages.impl.classification.{OpLogisticRegression, OpRandomForest} -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.testkit.{RandomIntegral, RandomReal, RandomVector} -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.classification.{OpLogisticRegression, OpRandomForestClassifier} import com.salesforce.op.stages.impl.preparators.SanityCheckDataTest import com.salesforce.op.stages.impl.regression.OpLinearRegression +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams +import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.testkit.{RandomIntegral, RandomReal, RandomVector} +import com.salesforce.op.utils.spark.RichDataset._ import 
com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import org.apache.spark.ml.classification.{LogisticRegressionModel, RandomForestClassificationModel} +import org.apache.spark.ml.regression.LinearRegressionModel import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType -import com.salesforce.op.utils.spark.RichDataset._ +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + @RunWith(classOf[JUnitRunner]) class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { @@ -57,7 +61,11 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { val dfWithMeta = addMetaData(df, "features", 40) val sparkModel = new OpLogisticRegression().setInput(l1r, f1).fit(df) - val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[LogisticRegressionModel] + + val insightsTransformer = new RecordInsightsLOCO(model).setInput(f1) val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) insights.foreach(_.value.size shouldBe 20) val parsed = insights.map(RecordInsightsParser.parseInsights) @@ -71,9 +79,11 @@ val (df, f1, l1) = TestFeatureBuilder("features", "labels", features.zip(labels)) val l1r = l1.copy(isResponse = true) val dfWithMeta = addMetaData(df, "features", 40) - val sparkModel = new OpRandomForest().setInput(l1r, f1).fit(df) + val sparkModel = new OpRandomForestClassifier().setInput(l1r, f1).fit(df) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[RandomForestClassificationModel] - val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1).setTopK(2) + val insightsTransformer = new RecordInsightsLOCO(model).setInput(f1).setTopK(2) val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) insights.foreach(_.value.size shouldBe 2) val parsed = insights.map(RecordInsightsParser.parseInsights) @@ -93,8 +103,10 @@ val l1r = l1.copy(isResponse = true) val dfWithMeta = addMetaData(df, "features", 40) val sparkModel = new OpLinearRegression().setInput(l1r, f1).fit(df) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[LinearRegressionModel] - val insightsTransformer = new RecordInsightsLOCO(sparkModel).setInput(f1) + val insightsTransformer = new RecordInsightsLOCO(model).setInput(f1) val insights = insightsTransformer.transform(dfWithMeta).collect(insightsTransformer.getOutput()) insights.foreach(_.value.size shouldBe 20) val parsed = insights.map(RecordInsightsParser.parseInsights) @@ -155,7 +167,9 @@ val (testData, name, labelNoRes, featureVector) = TestFeatureBuilder("name", "label", "features", data) val label = labelNoRes.copy(isResponse = true) val testDataMeta = addMetaData(testData, "features", 5) - val model = new OpLogisticRegression().setInput(label, featureVector).fit(testData) + val sparkModel = new OpLogisticRegression().setInput(label, featureVector).fit(testData) + val model = sparkModel.asInstanceOf[SparkWrapperParams[_]].getSparkMlStage().get + .asInstanceOf[LogisticRegressionModel]
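+ // RecordInsightsLOCO now takes the underlying Spark model, so the fitted OP stage is
+ // unwrapped via SparkWrapperParams.getSparkMlStage() before building the insights transformer
val transformer = new 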
RecordInsightsLOCO(model).setInput(featureVector) @@ -168,4 +182,4 @@ class RecordInsightsLOCOTest extends FlatSpec with TestSparkContext { parsed.foreach { case (_, in) => math.abs(in.head._2(0)._2 + in.head._2(1)._2) < 0.00001 shouldBe true } } -} \ No newline at end of file +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala index 9d0fb44f3b..c9d82b1d85 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/BadFeatureZooTest.scala @@ -31,7 +31,7 @@ package com.salesforce.op.stages.impl.preparators -import com.salesforce.op.OpWorkflow +import com.salesforce.op.{OpWorkflow, UID} import com.salesforce.op.features.types._ import com.salesforce.op.features.{Feature, FeatureLike} import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer @@ -53,6 +53,11 @@ class BadFeatureZooTest extends FlatSpec with TestSparkContext with Logging { // loggingLevel(Level.INFO) + override def beforeAll: Unit = { + super.beforeAll + UID.reset() + } + Spec[SanityChecker] should "correctly identify label leakage in PickList features using the Cramer's V criteria" + "when the label corresponds to a binary classification problem" in { // First set up the raw features: diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala index 5995a5a402..9c1ed41b26 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala @@ -32,10 +32,11 @@ package com.salesforce.op.stages.impl.preparators import com.salesforce.op._ +import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.stages.MetadataParam import com.salesforce.op.stages.base.binary.BinaryModel -import com.salesforce.op.stages.impl.feature.RealNNVectorizer +import com.salesforce.op.stages.impl.feature.{HashSpaceStrategy, RealNNVectorizer, SmartTextMapVectorizer} import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichMetadata._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} @@ -60,9 +61,40 @@ case class SanityCheckDataTest case class SCDataTest(label: RealNN, features: OPVector) +case class TextRawData +( + id: String, + target: Double, + textMap: Map[String, String] +) + @RunWith(classOf[JUnitRunner]) class SanityCheckerTest extends FlatSpec with TestSparkContext { + private val textRawData = Seq( + TextRawData("0", 1.0, Map("color" -> "red", "fruit" -> "berry", "beverage" -> "tea")), + TextRawData("1", 1.0, Map("color" -> "orange", "fruit" -> "berry", "beverage" -> "coffee")), + TextRawData("2", 1.0, Map("color" -> "yello", "fruit" -> "berry", "beverage" -> "water")), + TextRawData("3", 1.0, Map("color" -> "green", "fruit" -> "berry")), + TextRawData("4", 1.0, Map("color" -> "blue", "fruit" -> "berry")), + TextRawData("5", 1.0, Map("color" -> "indigo", "fruit" -> "berry")), + TextRawData("6", 0.0, Map("fruit" -> "peach")), + TextRawData("7", 0.0, Map("fruit" -> "peach")), + TextRawData("8", 0.0, Map("fruit" -> "mango")), + TextRawData("9", 0.0, Map("beverage" -> "tea")), + TextRawData("10", 0.0, Map("beverage" -> 
"coffee")), + TextRawData("11", 0.0, Map("beverage" -> "water")) + ).map( textRawData => + ( + textRawData.id.toText, + textRawData.target.toRealNN, + textRawData.textMap.toTextMap + ) + ) + + val (textData, id, target, textMap) = TestFeatureBuilder("id", "target", "textMap", textRawData) + val targetResponse: FeatureLike[RealNN] = target.copy(isResponse = true) + // scalastyle:off private val data = Seq( SanityCheckDataTest("alex", 32, 5.0, 0, 1, 0.5, 0), @@ -314,6 +346,76 @@ class SanityCheckerTest extends FlatSpec with TestSparkContext { "requirement failed: The sanity checker has dropped all of your features, check your input data quality" } + it should "remove individual text hash features independently" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(2).setNumFeatures(8).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setHashSpaceStrategy(HashSpaceStrategy.Shared) + .setInput(textMap).getOutput() + + val checkedFeatures = new SanityChecker() + .setCheckSample(1.0) + .setRemoveBadFeatures(true) + .setRemoveFeatureGroup(true) + .setProtectTextSharedHash(true) + .setMinCorrelation(0.0) + .setMaxCorrelation(0.8) + .setMaxCramersV(0.8) + .setInput(targetResponse, smartMapVectorized) + .getOutput() + + checkedFeatures.originStage shouldBe a[SanityChecker] + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, checkedFeatures).transform(textData) + + val featuresToDrop = Seq("textMap_4", "textMap_7", "textMap_color_NullIndicatorValue_8") + val featuresWithCorr = Seq("textMap_0", "textMap_1", "textMap_2", "textMap_3", "textMap_4", "textMap_5", + "textMap_6", "textMap_color_NullIndicatorValue_8", "textMap_fruit_NullIndicatorValue_9", + "textMap_beverage_NullIndicatorValue_10" + ) + val featuresWithNaNCorr = Seq("textMap_7") + + validateTransformerOutput(checkedFeatures.name, transformed, featuresWithCorr, featuresToDrop, featuresWithNaNCorr) + } + + it should "remove text hash features as groups" in { + val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setHashSpaceStrategy(HashSpaceStrategy.Separate) + .setInput(textMap).getOutput() + + val checkedFeatures = new SanityChecker() + .setCheckSample(1.0) + .setRemoveBadFeatures(true) + .setRemoveFeatureGroup(true) + .setProtectTextSharedHash(true) + .setMinCorrelation(0.0) + .setMaxCorrelation(0.8) + .setMaxCramersV(0.8) + .setInput(targetResponse, smartMapVectorized) + .getOutput() + + checkedFeatures.originStage shouldBe a[SanityChecker] + + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, checkedFeatures).transform(textData) + + val featuresToDrop = Seq("textMap_color_0", "textMap_color_1", "textMap_color_2", "textMap_color_3", + "textMap_fruit_4", "textMap_fruit_5", "textMap_fruit_6", "textMap_fruit_7", + "textMap_beverage_8", "textMap_beverage_9", + "textMap_color_NullIndicatorValue_12", "textMap_fruit_NullIndicatorValue_13" + ) + val featuresWithCorr = Seq("textMap_color_0", "textMap_color_3", + "textMap_fruit_5", "textMap_fruit_6", "textMap_fruit_7", + "textMap_beverage_10", "textMap_beverage_11", + "textMap_color_NullIndicatorValue_12", "textMap_fruit_NullIndicatorValue_13", + "textMap_beverage_NullIndicatorValue_14" + ) + val featuresWithNaNCorr = Seq("textMap_color_1", "textMap_color_2", "textMap_fruit_4", + "textMap_beverage_8", "textMap_beverage_9" + ) + + validateTransformerOutput(checkedFeatures.name, transformed, 
featuresWithCorr, featuresToDrop, featuresWithNaNCorr) + } + private def validateEstimatorOutput(outputColName: String, model: BinaryModel[RealNN, OPVector, OPVector], expectedFeaturesToDrop: Seq[String], label: String): Unit = { val metadata = model.getMetadata() diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala new file mode 100644 index 0000000000..1e0ae75d86 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpDecisionTreeRegressorTest.scala @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpDecisionTreeRegressorTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[DecisionTreeRegressionModel], + OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpDecisionTreeRegressor().setInput(label, features) + + val expectedResult = Seq( + Prediction(10.0), + Prediction(20.0), + Prediction(30.0), + Prediction(40.0), + Prediction(50.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala new file mode 100644 index 0000000000..dde440e5f9 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGBTRegressorTest.scala @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpGBTRegressorTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[GBTRegressionModel], + OpPredictorWrapper[GBTRegressor, GBTRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpGBTRegressor().setInput(label, features) + + val expectedResult = Seq( + Prediction(10.0), + Prediction(20.0), + Prediction(30.0), + Prediction(40.0), + Prediction(50.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(10) + .setMaxDepth(6) + .setMaxBins(2) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getMaxDepth shouldBe 6 + estimator.predictor.getMaxBins shouldBe 2 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala new file mode 100644 index 0000000000..58cec6e044 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpGeneralizedLinearRegressionTest.scala @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{GeneralizedLinearRegression, GeneralizedLinearRegressionModel} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpGeneralizedLinearRegressionTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[GeneralizedLinearRegressionModel], + OpPredictorWrapper[GeneralizedLinearRegression, GeneralizedLinearRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpGeneralizedLinearRegression().setInput(label, features) + + val expectedResult = Seq( + Prediction(10.0, 9.99), + Prediction(20.0, 19.99), + Prediction(30.0, 29.99), + Prediction(40.0, 40.0), + Prediction(50.0, 50.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxIter(10) + .setRegParam(0.1) + .setFitIntercept(true) + .setTol(1E-4) + .setSolver("normal") + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getFitIntercept shouldBe true + estimator.predictor.getTol shouldBe 1E-4 + estimator.predictor.getSolver shouldBe "normal" + + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala index fa28f442e2..6efcf3232e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpLinearRegressionTest.scala @@ -32,62 +32,53 @@ package com.salesforce.op.stages.impl.regression import com.salesforce.op.features.types._ -import com.salesforce.op.stages.impl.preparators.SanityChecker -import com.salesforce.op.stages.sparkwrappers.generic._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.stages.base.binary.{BinaryEstimator, BinaryModel} 
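+// OpEstimatorSpec derives the shared estimator checks from inputData, estimator and expectedResult;
+// PredictionEquality relaxes comparison of Prediction values to approximate numeric equality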
+import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.regression.LinearRegressionModel +import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class OpLinearRegressionTest extends FlatSpec with TestSparkContext { - val stageNames = Array[String]("LinearRegression_predictionCol") +class OpLinearRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LinearRegressionModel], + OpPredictorWrapper[LinearRegression, LinearRegressionModel]] with PredictionEquality { - val (ds, rawLabel, features) = TestFeatureBuilder( + val (inputData, rawLabel, features) = TestFeatureBuilder( Seq[(RealNN, OPVector)]( (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) - ) + ) ) val label = rawLabel.copy(isResponse = true) - val linReg = new OpLinearRegression().setInput(label, features) - - Spec[OpLinearRegression] should "have output with correct origin stage" in { - val output = linReg.getOutput() - assert(output.originStage.isInstanceOf[SwBinaryEstimator[_, _, _, _, _]]) - the[IllegalArgumentException] thrownBy { - linReg.setInput(label.copy(isResponse = true), features.copy(isResponse = true)) - } should have message "The feature vector should not contain any response features." 
- } - - it should "return a properly formed LinearRegressionModel when fitted" in { - val model = linReg.setSparkParams("maxIter", 10).fit(ds) - assert(model.isInstanceOf[SwBinaryModel[RealNN, OPVector, RealNN, LinearRegressionModel]]) - - val sparkStage = model.getSparkMlStage() - - sparkStage.isDefined shouldBe true - sparkStage.get shouldBe a[LinearRegressionModel] - - val inputNames = model.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(label.name, features.name) - } + val estimator = new OpLinearRegression().setInput(label, features) + val expectedResult = Seq( + Prediction(10.0), + Prediction(20.0), + Prediction(30.0), + Prediction(40.0), + Prediction(50.0) + ) it should "allow the user to set the desired spark parameters" in { - linReg.setMaxIter(10).setRegParam(0.1) - linReg.getMaxIter shouldBe 10 - linReg.getRegParam shouldBe 0.1 + estimator + .setMaxIter(10) + .setRegParam(0.1) + .setFitIntercept(true) + .setElasticNetParam(0.1) + .setSolver("normal") + estimator.fit(inputData) + + estimator.predictor.getMaxIter shouldBe 10 + estimator.predictor.getRegParam shouldBe 0.1 + estimator.predictor.getFitIntercept shouldBe true + estimator.predictor.getElasticNetParam shouldBe 0.1 + estimator.predictor.getSolver shouldBe "normal" - linReg.setFitIntercept(true).setElasticNetParam(0.1).setSolver("normal") - linReg.getFitIntercept shouldBe true - linReg.getElasticNetParam shouldBe 0.1 - linReg.getSolver shouldBe "normal" } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala new file mode 100644 index 0000000000..7a9080b0f9 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRandomForestRegressorTest.scala @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.regression + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.PredictionEquality +import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel} +import com.salesforce.op.test._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpRandomForestRegressorTest extends OpEstimatorSpec[Prediction, + OpPredictorWrapperModel[RandomForestRegressionModel], + OpPredictorWrapper[RandomForestRegressor, RandomForestRegressionModel]] with PredictionEquality { + + val (inputData, rawLabel, features) = TestFeatureBuilder( + Seq[(RealNN, OPVector)]( + (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector), + (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector), + (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector), + (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector), + (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector) + ) + ) + val label = rawLabel.copy(isResponse = true) + val estimator = new OpRandomForestRegressor().setInput(label, features) + + val expectedResult = Seq( + Prediction(20.0), + Prediction(23.5), + Prediction(31.5), + Prediction(35.5), + Prediction(37.0) + ) + + it should "allow the user to set the desired spark parameters" in { + estimator + .setMaxDepth(7) + .setMaxBins(3) + .setMinInstancesPerNode(2) + .setMinInfoGain(0.1) + .setSeed(42L) + estimator.fit(inputData) + + estimator.predictor.getMaxDepth shouldBe 7 + estimator.predictor.getMaxBins shouldBe 3 + estimator.predictor.getMinInstancesPerNode shouldBe 2 + estimator.predictor.getMinInfoGain shouldBe 0.1 + estimator.predictor.getSeed shouldBe 42L + + } +} diff --git a/core/src/test/scala/org/apache/spark/ml/regression/OpPredictionModelTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala similarity index 80% rename from core/src/test/scala/org/apache/spark/ml/regression/OpPredictionModelTest.scala rename to core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala index ea612d29a4..11b75b6772 100644 --- a/core/src/test/scala/org/apache/spark/ml/regression/OpPredictionModelTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/regression/OpRegressionModelTest.scala @@ -29,19 +29,20 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -package org.apache.spark.ml.regression +package com.salesforce.op.stages.impl.regression import com.salesforce.op.features.types.{Prediction, RealNN} +import com.salesforce.op.stages.sparkwrappers.specific.SparkModelConverter.toOP import com.salesforce.op.test._ import com.salesforce.op.testkit._ -import org.apache.spark.ml.SparkModelConverter.toOP +import org.apache.spark.ml.regression._ import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class OpPredictionModelTest extends FlatSpec with TestSparkContext { +class OpRegressionModelTest extends FlatSpec with TestSparkContext { private val label = RandomIntegral.integrals(0, 2).limit(1000) .map{ v => RealNN(v.value.map(_.toDouble).getOrElse(0.0)) } @@ -57,19 +58,19 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } - Spec[OpLinearPredictionModel] should "produce the same values as the spark version" in { + Spec[OpLinearRegressionModel] should "produce the same values as the spark version" in { val spk = new LinearRegression() .setFeaturesCol(featureV.name) .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -80,7 +81,7 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } @@ -91,11 +92,21 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { .setLabelCol(labelF.name) .fit(rawDF) - val op = toOP(Some(spk)).setInput(labelF, featureV) + val op = toOP(spk, spk.uid).setInput(labelF, featureV) compareOutputs(spk.transform(rawDF), op.transform(rawDF)) } + Spec[OpGeneralizedLinearRegressionModel] should "produce the same values as the spark version" in { + val spk = new GeneralizedLinearRegression() + .setFeaturesCol(featureV.name) + .setLabelCol(labelF.name) + .fit(rawDF) + + val op = toOP(spk, spk.uid).setInput(labelF, featureV) + + compareOutputs(spk.transform(rawDF), op.transform(rawDF)) + } def compareOutputs(df1: DataFrame, df2: DataFrame): Unit = { val sorted1 = df1.collect().sortBy(_.getAs[Double](2)) @@ -106,3 +117,5 @@ class OpPredictionModelTest extends FlatSpec with TestSparkContext { } } } + + diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala index 5e98c480f4..f69c7ee0c1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataBalancerTest.scala @@ -52,12 +52,12 @@ class DataBalancerTest extends FlatSpec with TestSparkContext { // Generate positive observations following a distribution ~ N((0.0, 0.0, 0.0), I_3) val positiveData = { RandomRDDs.normalVectorRDD(sc, bigCount, 3, seed = seed) - .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDS() + .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDF() } // Generate negative observations following a distribution ~ N((10.0, 
10.0, 10.0), I_3) val negativeData = { RandomRDDs.normalVectorRDD(sc, smallCount, 3, seed = seed) - .map(v => (0.0, Vectors.dense(v.toArray.map(_ + 10.0)), "B")).toDS() + .map(v => (0.0, Vectors.dense(v.toArray.map(_ + 10.0)), "B")).toDF() } val data = positiveData.union(negativeData) @@ -77,7 +77,7 @@ val (downSample, upSample) = dataBalancer.getProportions(smallCount, bigCount, sampleFraction, maxTrainingSample) val reSampled = dataBalancer.rebalance(negativeData, upSample, positiveData, downSample, seed) - val Array(negData, posData) = Array(0.0, 1.0).map(label => reSampled.filter(_._1 == label).persist()) + val Array(negData, posData) = Array(0.0, 1.0).map(label => reSampled.filter(_.getDouble(0) == label).persist()) val negativeCount = negData.count() val positiveCount = posData.count() @@ -107,8 +107,13 @@ balancer.getDownSampleFraction shouldBe downSample balancer.getIsPositiveSmall shouldBe false + // Rerun balancer with set params + val metadata = balancer.metadataBuilder val ModelData(expected2, _) = balancer.prepare(data) + withClue("Data balancer should not update the metadata"){ + balancer.metadataBuilder shouldBe metadata + } expected.collect() shouldBe expected2.collect() } @@ -125,7 +130,11 @@ balancer.getAlreadyBalancedFraction shouldBe 1.0 // Rerun balancer with set params + val metadata = balancer.metadataBuilder val ModelData(expected2, _) = balancer.prepare(data) + withClue("Data balancer should not update the metadata"){ + balancer.metadataBuilder shouldBe metadata + } expected.collect() shouldBe expected2.collect() } @@ -144,7 +153,11 @@ balancer.getAlreadyBalancedFraction shouldBe maxSize.toDouble / (smallCount + bigCount) // Rerun balancer with set params + val metadata = balancer.metadataBuilder val ModelData(expected2, _) = balancer.prepare(data) + withClue("Data balancer should not update the metadata"){ + balancer.metadataBuilder shouldBe metadata + } expected.collect() shouldBe expected2.collect() } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala index 0293b7799a..706472c37f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataCutterTest.scala @@ -54,8 +54,8 @@ class DataCutterTest extends FlatSpec with TestSparkContext { val data = labels.zip(vectors).zip(labelsBiased) val dataSize = data.size - val randDF = sc.makeRDD(data.map { case ((l, v), b) => (l.toDouble.get, v.value, b.toString) }).toDS() - val biasDF = sc.makeRDD(data.map { case ((l, v), b) => (b.toDouble.get, v.value, l.toString) }).toDS() + val randDF = sc.makeRDD(data.map { case ((l, v), b) => (l.toDouble.get, v.value, b.toString) }).toDF() + val biasDF = sc.makeRDD(data.map { case ((l, v), b) => (b.toDouble.get, v.value, l.toString) }).toDF() val seed = 42L Spec[DataCutter] should "not filter out any data when the parameters are permissive" in {
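// A minimal sketch (not part of this patch) of the rerun guarantee the balancer tests assert,
// assuming a labeled DataFrame `data` like the ones built above (splitter params shown are
// illustrative):
//
//   val balancer = new DataBalancer().setSampleFraction(0.5).setSeed(seed)
//   val ModelData(first, _) = balancer.prepare(data)   // first run computes and records fractions
//   val snapshot = balancer.metadataBuilder
//   val ModelData(second, _) = balancer.prepare(data)  // rerun reuses the params already set
//   balancer.metadataBuilder shouldBe snapshot         // prepare must not update the metadata again
//   first.collect() shouldBe second.collect()
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataSplitterTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataSplitterTest.scala index 2235597324..d4d1438881 100644 --- 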
a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataSplitterTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/DataSplitterTest.scala @@ -47,7 +47,7 @@ class DataSplitterTest extends FlatSpec with TestSparkContext { val data = RandomRDDs.normalVectorRDD(sc, 1000, 3, seed = seed) - .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDS() + .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDF() val dataSplitter = new DataSplitter().setSeed(seed) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala index 5665bcfde7..116f94f979 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/tuning/OpValidatorTest.scala @@ -34,24 +34,23 @@ package com.salesforce.op.stages.impl.tuning import com.salesforce.op.evaluators.Evaluators import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.classification.ProbabilisticClassifierType.{ProbClassifier, ProbClassifierModel} -import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames -import com.salesforce.op.stages.impl.tuning.SelectorData.LabelFeaturesKey import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.testkit.{RandomBinary, RandomIntegral, RandomReal, RandomVector} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row -import org.apache.spark.sql.functions.monotonically_increasing_id import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.types.MetadataBuilder +import com.salesforce.op.utils.spark.RichDataset._ @RunWith(classOf[JUnitRunner]) class OpValidatorTest extends FlatSpec with TestSparkContext { // Random Data val count = 1000 val sizeOfVector = 2 - val seed = 1234L + val seed = 12345L val p = 0.325 val multiClassProbabilities = Array(0.21, 0.29, 0.5) val vectors = RandomVector.sparse(RandomReal.uniform[Real](-1.0, 1.0), sizeOfVector).take(count) @@ -75,27 +74,27 @@ class OpValidatorTest extends FlatSpec with TestSparkContext { stratify = true ) - val rdd = data.withColumn(ModelSelectorBaseNames.idColName, monotonically_increasing_id()).rdd + val binaryDS = data.select(label, features) + val multiDS = data.select(multiLabel, features) - val binaryRDD = rdd.map { - case Row(label, features, _, index) => (label, features, index).asInstanceOf[LabelFeaturesKey] - } - - val multiRDD = rdd.map { - case Row(_, features, multiLabel, index) => (multiLabel, features, index).asInstanceOf[LabelFeaturesKey] - } + val condition = cv.isClassification && cv.stratify + val balancer = Option(new DataBalancer()) + val cutter = Option(new DataCutter()) Spec[OpCrossValidation[_, _]] should "stratify binary class data" in { - val splits = cv.createTrainValidationSplits(binaryRDD) + val splits = cv.createTrainValidationSplits(condition, binaryDS, label.name, balancer) + splits.length shouldBe ValidatorParamDefaults.NumFolds splits.foreach { case (train, validate) => assertFractions(Array(1 - p, p), train) assertFractions(Array(1 - p, p), validate) } + balancer.get.metadataBuilder.build() should not be new MetadataBuilder().build() } it should "stratify multi class data" in { - val splits = cv.createTrainValidationSplits(multiRDD) + val splits = cv.createTrainValidationSplits(condition, multiDS, multiLabel.name, cutter) + splits.length 
shouldBe ValidatorParamDefaults.NumFolds splits.foreach { case (train, validate) => assertFractions(multiClassProbabilities, train) assertFractions(multiClassProbabilities, validate) @@ -104,15 +103,16 @@ class OpValidatorTest extends FlatSpec with TestSparkContext { Spec[OpTrainValidationSplit[_, _]] should "stratify binary class data" in { - val splits = ts.createTrainValidationSplits(binaryRDD) + val splits = ts.createTrainValidationSplits(condition, binaryDS, label.name, balancer) splits.foreach { case (train, validate) => assertFractions(Array(1 - p, p), train) assertFractions(Array(1 - p, p), validate) } + balancer.get.metadataBuilder.build() should not be new MetadataBuilder().build() } it should "stratify multi class data" in { - val splits = ts.createTrainValidationSplits(multiRDD) + val splits = ts.createTrainValidationSplits(condition, multiDS, multiLabel.name, cutter) splits.foreach { case (train, validate) => assertFractions(multiClassProbabilities, train) assertFractions(multiClassProbabilities, validate) @@ -132,7 +132,8 @@ class OpValidatorTest extends FlatSpec with TestSparkContext { }.groupByKey().mapValues(_.size / n).sortBy(_._1).values.collect() fractions zip fractionsByClass map { case (expected, actual) => - math.abs(expected - actual) should be < 0.05 } + math.abs(expected - actual) should be < 0.065 + } } } diff --git a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala index 659c737ba2..e85cd7f5a5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParamsTest.scala @@ -66,13 +66,4 @@ class SparkWrapperParamsTest extends FlatSpec with BeforeAndAfterEach with TestC swEstimator.getSparkMlStage() shouldBe None } - it should "when setting the stage it should also set path" in { - // should should be none because nothing is set - swEstimator.getStageSavePath().get shouldBe swEstimator.getSavePath() - - swEstimator.setSavePath(path) - swEstimator.setSparkMlStage(Some(new StandardScaler())) - - swEstimator.getStageSavePath().get shouldBe swEstimator.getSavePath() - } } diff --git a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala index f7830d39a8..b421f255ec 100644 --- a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpPredictorWrapperTest.scala @@ -32,6 +32,7 @@ package com.salesforce.op.stages.sparkwrappers.specific import com.salesforce.op.features.types._ +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import com.salesforce.op.test.{PrestigeData, TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} @@ -46,11 +47,11 @@ class OpPredictorWrapperTest extends FlatSpec with TestSparkContext with Prestig val log = LoggerFactory.getLogger(this.getClass) - val (ds, targetLabel, featureVector) = TestFeatureBuilder[Real, OPVector]( - prestigeSeq.map(p => p.prestige.toReal -> Vectors.dense(p.education, p.income, p.women).toOPVector) + val (ds, targetLabel, featureVector) = TestFeatureBuilder[RealNN, 
OPVector]( + prestigeSeq.map(p => p.prestige.toRealNN -> Vectors.dense(p.education, p.income, p.women).toOPVector) ) - Spec[OpPredictorWrapper[_, _, _, _]] should + Spec[OpPredictorWrapper[_, _]] should "be able to run a simple logistic regression model (fitIntercept=true)" in { val lrModel: LinearRegressionModel = fitLinRegModel(fitIntercept = true) lrModel.intercept.abs should be > 1E-6 @@ -69,12 +70,11 @@ class OpPredictorWrapperTest extends FlatSpec with TestSparkContext with Prestig .setElasticNetParam(0.8) .setFitIntercept(fitIntercept) - val lr = - new OpPredictorWrapper[Real, Real, LinearRegression, LinearRegressionModel](lrBase) - .setInput(targetLabel, featureVector) + val lr = new OpPredictorWrapper[LinearRegression, LinearRegressionModel](lrBase) + .setInput(targetLabel, featureVector) // Fit the model - val model = lr.fit(ds) + val model = lr.fit(ds).asInstanceOf[SparkWrapperParams[LinearRegressionModel]] val lrModel = model.getSparkMlStage().get // Print the coefficients and intercept for linear regression diff --git a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapperTest.scala b/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapperTest.scala deleted file mode 100644 index d9db98dc50..0000000000 --- a/core/src/test/scala/com/salesforce/op/stages/sparkwrappers/specific/OpProbabilisticClassifierWrapperTest.scala +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
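The updated OpPredictorWrapper drops the input/output feature type parameters and is typed only by the Spark estimator and its model. A minimal sketch of the new call pattern, mirroring the test above (the label, feature vector, and dataset names come from that test and are otherwise hypothetical):

  import com.salesforce.op.features.types._
  import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
  import com.salesforce.op.stages.sparkwrappers.specific.OpPredictorWrapper
  import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}

  // Wrap the Spark estimator; only the [Estimator, Model] type parameters remain
  val lr = new OpPredictorWrapper[LinearRegression, LinearRegressionModel](new LinearRegression())
    .setInput(targetLabel, featureVector) // a RealNN label and an OPVector feature

  // Fitting returns an OP stage; cast to SparkWrapperParams to reach the fitted Spark model
  val model = lr.fit(ds).asInstanceOf[SparkWrapperParams[LinearRegressionModel]]
  val sparkModel: LinearRegressionModel = model.getSparkMlStage().get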
- */ - -package com.salesforce.op.stages.sparkwrappers.specific - -import com.salesforce.op.features.FeatureSparkTypes -import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, LogisticRegression, LogisticRegressionModel} -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.sql.types.{DoubleType, StructField, StructType} -import org.junit.runner.RunWith -import org.scalatest.FlatSpec -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -class OpProbabilisticClassifierWrapperTest extends FlatSpec with TestSparkContext { - - val (testData, targetLabel, featureVector) = TestFeatureBuilder("label", "features", - Seq[(RealNN, OPVector)]( - 1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector, - 0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector, - 0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector, - 1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector, - 1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector, - 0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector, - 1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector, - 0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector, - 0.0.toRealNN -> Vectors.dense(12.0, 3.3, -0.1).toOPVector - ) - ) - - Spec[OpProbabilisticClassifierWrapper[_, _]] should "have the correct params set (fitIntercept = true)" in { - val lrClassifierModel: LogisticRegressionModel = fitLrModel(fitInterceptParam = true) - lrClassifierModel.intercept.abs should be > 1E-6 - } - - it should "have the correct params set (logreg with fitIntercept = false)" in { - val lrClassifierModel: LogisticRegressionModel = fitLrModel(fitInterceptParam = false) - lrClassifierModel.intercept.abs should be < Double.MinPositiveValue - } - - it should "should have the expected feature name (decision tree)" in { - val wrappedEstimator = - new OpProbabilisticClassifierWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]( - new DecisionTreeClassifier() - ).setInput(targetLabel, featureVector) - - val (out1, out2, out3) = wrappedEstimator.getOutput() - - out1.name shouldBe wrappedEstimator.stage1.getOutput().name - out2.name shouldBe wrappedEstimator.stage2.getOutput().name - out3.name shouldBe wrappedEstimator.stage3.getOutput().name - } - - it should "have the correct params set (decision tree with maxDepth = 1)" in { - val depth = 1 - val dtClassifierModel: DecisionTreeClassificationModel = fitDtModel(depth) - assert(dtClassifierModel.toDebugString.contains(s"depth $depth")) - } - - it should "have the correct params set (decision tree with maxDepth = 2)" in { - val depth = 2 - val dtClassifierModel: DecisionTreeClassificationModel = fitDtModel(depth) - assert(dtClassifierModel.toDebugString.contains(s"depth $depth")) - } - - it should "ignore values set for input and output cols outside the OP wrapper" in { - // configure input classifier and set input col names outside of OP wrapper - val customLabelColName = "indexedLabel" - val customFeaturesColName = "indexedFeatures" - val customProbCol = "xxx" - val customPredCol = "yyy" - val customRawCol = "zzz" - val dtClassifier = new DecisionTreeClassifier() - - dtClassifier.setLabelCol(customLabelColName).setFeaturesCol(customFeaturesColName) - dtClassifier.setPredictionCol(customPredCol).setProbabilityCol(customProbCol).setRawPredictionCol(customRawCol) - - val dtEstimator = - new 
OpProbabilisticClassifierWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel](dtClassifier) - .setInput(targetLabel, featureVector) - - // verify that the colnames configured outside the opwrapper where ignored and are what is expected - val inputNames = dtEstimator.stage1.getInputFeatures().map(_.name) - inputNames should have length 2 - inputNames shouldBe Array(targetLabel.name, featureVector.name) - dtClassifier.setLabelCol(customLabelColName).setFeaturesCol(customFeaturesColName) - dtClassifier.setPredictionCol(customPredCol).setProbabilityCol(customProbCol).setRawPredictionCol(customRawCol) - - val model = dtEstimator.fit(testData) - - dtEstimator.uid shouldBe model.uid - - dtClassifier.setLabelCol(customLabelColName).setFeaturesCol(customFeaturesColName) - dtClassifier.setPredictionCol(customPredCol).setProbabilityCol(customProbCol).setRawPredictionCol(customRawCol) - - val (out1, out2, out3) = model.getOutput() - val output = model.transform(testData) - - output.schema shouldBe StructType(Array( - StructField(targetLabel.name, DoubleType, true), - StructField(featureVector.name, FeatureSparkTypes.sparkTypeOf[OPVector], true), - StructField(out2.name, FeatureSparkTypes.sparkTypeOf[OPVector], true), - StructField(out3.name, FeatureSparkTypes.sparkTypeOf[OPVector], true), - StructField(out1.name, DoubleType, true) - )) - } - - def fitDtModel(depth: Int): DecisionTreeClassificationModel = { - val dtClassifier = new DecisionTreeClassifier().setMaxDepth(depth) - - val dtEstimator = new OpProbabilisticClassifierWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]( - dtClassifier - ).setInput(targetLabel, featureVector) - - val model = dtEstimator.fit(testData) - val output = model.transform(testData) - - val dtClassifierModel = model.stage1.getSparkMlStage().get - dtClassifierModel - } - - def fitLrModel(fitInterceptParam: Boolean): LogisticRegressionModel = { - val regParam = 0.3 - val elasticNetParam = 0.8 - val maxIterParam = 100 - val tolParam = 1E-6 - - val lrClassifier = new LogisticRegression() - .setRegParam(regParam) - .setElasticNetParam(elasticNetParam) - .setMaxIter(maxIterParam) - .setTol(tolParam) - .setFitIntercept(fitInterceptParam) - - val testEstimator = new OpProbabilisticClassifierWrapper[LogisticRegression, LogisticRegressionModel]( - lrClassifier - ).setInput(targetLabel, featureVector) - - val model = testEstimator.fit(testData) - val output = model.transform(testData) - - val lrClassifierModel = model.stage1.getSparkMlStage().get - - lrClassifierModel.getRegParam shouldBe regParam - lrClassifierModel.getElasticNetParam shouldBe elasticNetParam - lrClassifierModel.getMaxIter shouldBe maxIterParam - lrClassifierModel.getTol shouldBe tolParam - lrClassifierModel.getFitIntercept shouldBe fitInterceptParam - - lrClassifierModel - } -} - - diff --git a/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTaggerTest.scala b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTaggerTest.scala new file mode 100644 index 0000000000..bd94c6cfda --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPNameEntityTaggerTest.scala @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
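The new OpenNLPNameEntityTaggerTest that follows feeds tokenized sentences to the tagger and checks the per-token entity tags. Condensed to its essentials, the flow it exercises looks like this (the sentence tokens are illustrative):

  import com.salesforce.op.utils.text.{Language, NameEntityType, OpenNLPNameEntityTagger}

  val tagger = new OpenNLPNameEntityTagger()
  // Tag a pre-tokenized sentence, asking only for Person entities;
  // tokenTags maps each recognized token to its set of entity types
  val tagged = tagger.tag(
    Seq("Pierre", "Vinken", "will", "join", "the", "board"),
    Language.English,
    Seq(NameEntityType.Person)
  )
  tagged.tokenTags // e.g. Map("Pierre" -> Set(Person), "Vinken" -> Set(Person))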
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.utils.text + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.feature.NameEntityRecognizer +import com.salesforce.op.test.TestCommon +import com.salesforce.op.utils.text.NameEntityType._ +import opennlp.tools.util.Span +import org.junit.runner.RunWith +import org.scalatest._ +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpenNLPNameEntityTaggerTest extends FlatSpec with TestCommon { + + val nerTagger = new OpenNLPNameEntityTagger() + + Spec[OpenNLPNameEntityTagger] should "return consistent results as expected" in { + val input = Seq( + "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.", + "Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a director of this" + + "a director of this British industrial conglomerate."
+ ) + val tokens: Seq[TextList] = input.map(x => NameEntityRecognizer.Analyzer.analyze(x, Language.English).toTextList) + val expectedOutputs = Seq( + Map("Vinken" -> Set(Person), "Pierre" -> Set(Person)), + Map("Agnew" -> Set(Person), "Rudolph" -> Set(Person)) + ) + tokens.zip(expectedOutputs).foreach { case (tokenInput, expected) => + nerTagger.tag(tokenInput.value, Language.English, Seq(NameEntityType.Person)).tokenTags shouldEqual expected + } + } + + it should "load all the existing name entity recognition models" in { + val languageNameEntityPairs = Seq( + (Language.English, NameEntityType.Date), + (Language.English, NameEntityType.Location), + (Language.English, NameEntityType.Money), + (Language.English, NameEntityType.Organization), + (Language.English, NameEntityType.Percentage), + (Language.English, NameEntityType.Person), + (Language.English, NameEntityType.Time), + (Language.Spanish, NameEntityType.Location), + (Language.Spanish, NameEntityType.Organization), + (Language.Spanish, NameEntityType.Person), + (Language.Spanish, NameEntityType.Misc), + (Language.Dutch, NameEntityType.Location), + (Language.Dutch, NameEntityType.Organization), + (Language.Dutch, NameEntityType.Person), + (Language.Dutch, NameEntityType.Misc) + ) + languageNameEntityPairs.foreach { case (l, n) => + OpenNLPModels.getTokenNameFinderModel(l, n).isDefined shouldBe true + } + } + + it should "not load any model if no such model exists" in { + val languageNameEntityPairs = Seq( + (Language.Unknown, NameEntityType.Other), + (Language.Urdu, NameEntityType.Location) + ) + languageNameEntityPairs.foreach { case (l, n) => + OpenNLPModels.getTokenNameFinderModel(l, n) shouldBe None + } + } + + // test the convertSpansToMap function + it should "retrieve correct information from the output of the name entity recognition model" in { + val inputs = Seq(Array("ab", "xx", "yy", "zz", "ss", "dd", "cc") -> + Seq(new Span(2, 4, "person"), new Span(3, 5, "location")), // interweaving entities + Array("a", "b", "c", "d") -> Seq(new Span(3, 4, "location")), // end of sentence entity + Array("a", "b", "c", "d") -> Seq(new Span(0, 2, "location")), // beginning of sentence entity + Array("a", "b", "c", "d") -> Seq.empty + ) + val expectedOutputs = Seq( + Map("yy" -> Set(Person), "zz" -> Set(Person, Location), "ss" -> Set(Location)), + Map("d" -> Set(Location)), + Map("a" -> Set(Location), "b" -> Set(Location)), + Map.empty[String, Set[String]] + ) + + inputs.zip(expectedOutputs).map { case (tokensInput, expected) => + val actual = nerTagger.convertSpansToMap(tokensInput._2, tokensInput._1) + actual shouldEqual expected + } + } + +} diff --git a/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitterTest.scala b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitterTest.scala new file mode 100644 index 0000000000..14c6a5bbc9 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/utils/text/OpenNLPSentenceSplitterTest.scala @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.utils.text + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages.impl.feature.TextTokenizer +import com.salesforce.op.stages.impl.feature.TextTokenizer.TextTokenizerResult +import com.salesforce.op.test.TestCommon +import com.salesforce.op.utils.text.Language._ +import opennlp.tools.sentdetect.SentenceModel +import opennlp.tools.tokenize.TokenizerModel +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class OpenNLPSentenceSplitterTest extends FlatSpec with TestCommon { + + val splitter = new OpenNLPSentenceSplitter() + + Spec[OpenNLPSentenceSplitter] should "split an English paragraph into sentences" in { + val input = + "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov 29. " + + "Mr Vinken is chairman of Elsevier N.V., the Dutch publishing group. Rudolph Agnew, 55 years old and " + + "former chairman of Consolidated Gold Fields PLC, was named a director of this British industrial conglomerate." + + splitter.getSentences(input, language = English) shouldEqual Seq( + "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov 29.", + "Mr Vinken is chairman of Elsevier N.V., the Dutch publishing group.", + "Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, " + + "was named a director of this British industrial conglomerate." 
+ ) + + TextTokenizer.tokenize(input.toText, sentenceSplitter = Option(splitter), defaultLanguage = English) shouldEqual + TextTokenizerResult(English, Seq( + Seq("pierr", "vinken", "61", "year", "old", "will", "join", "board", + "nonexecut", "director", "nov", "29").toTextList, + Seq("mr", "vinken", "chairman", "elsevi", "n.v", "dutch", "publish", "group").toTextList, + Seq("rudolph", "agnew", "55", "year", "old", "former", "chairman", "consolid", "gold", "field", "plc", + "name", "director", "british", "industri", "conglomer").toTextList)) + + TextTokenizer.tokenize(input.toText, analyzer = new OpenNLPAnalyzer(), sentenceSplitter = Option(splitter), + defaultLanguage = English) shouldEqual TextTokenizerResult( + English, Seq( + Seq("pierre", "vinken", ",", "61", "years", "old", ",", "will", "join", "the", "board", "as", "a", + "nonexecutive", "director", "nov", "29", ".").toTextList, + Seq("mr", "vinken", "is", "chairman", "of", "elsevier", "n", ".v.", ",", "the", "dutch", "publishing", + "group", ".").toTextList, + Seq("rudolph", "agnew", ",", "55", "years", "old", "and", "former", "chairman", "of", "consolidated", + "gold", "fields", "plc", ",", "was", "named", "a", "director", "of", "this", "british", "industrial", + "conglomerate", ".").toTextList)) + } + + it should "split a Portuguese text into sentences" in { + // scalastyle:off + val input = "Depois de Guimarães, o North Music Festival estaciona este ano no Porto. A partir de sexta-feira, " + + "a Alfândega do Porto recebe a segunda edição deste festival de dois dias. No cartaz há nomes como os " + + "portugueses Linda Martini e Mão Morta, mas também Guano Apes ou os DJ’s portugueses Rich e Mendes." + + splitter.getSentences(input, language = Portuguese) shouldEqual Seq( + "Depois de Guimarães, o North Music Festival estaciona este ano no Porto.", + "A partir de sexta-feira, a Alfândega do Porto recebe a segunda edição deste festival de dois dias.", + "No cartaz há nomes como os portugueses Linda Martini e Mão Morta, mas também Guano Apes ou os DJ’s " + + "portugueses Rich e Mendes." 
+ ) + // scalastyle:on + } + + it should "load a sentence detection and tokenizer model for a language if they exist" in { + val languages = Seq(Danish, Portuguese, English, Dutch, German, Sami) + languages.map { language => + OpenNLPModels.getSentenceModel(language).exists(_.isInstanceOf[SentenceModel]) shouldBe true + OpenNLPModels.getTokenizerModel(language).exists(_.isInstanceOf[TokenizerModel]) shouldBe true + } + } + + it should "not load a sentence detection or tokenizer model for a language if they do not exist" in { + val languages = Seq(Japanese, Czech) + languages.map { language => + OpenNLPModels.getSentenceModel(language) shouldEqual None + OpenNLPModels.getTokenizerModel(language) shouldEqual None + } + } + + it should "return non-preprocessed input if no such sentence detection model exists" in { + // scalastyle:off + val input = "ピエール・ヴィンケン(61歳)は、11月29日に臨時理事に就任します。" + + "ヴィンケン氏は、オランダの出版グループであるエルゼビアN.V.の会長です。 " + + "55歳のルドルフ・アグニュー(Rudolph Agnew、元コネチカットゴールドフィールドPLC)会長は、" + + "この英国の産業大企業の取締役に任命されました。" + // scalastyle:on + splitter.getSentences(input, language = Language.Japanese) shouldEqual Seq(input) + } +} diff --git a/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala b/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala index b1263b898e..f3c581dd43 100644 --- a/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala +++ b/features/src/main/scala/com/salesforce/op/features/TransientFeature.scala @@ -110,7 +110,6 @@ class TransientFeature */ def asFeatureLike[I <: FeatureType]: FeatureLike[I] = getFeature().asInstanceOf[FeatureLike[I]] - /** * Transform trasient feature into column metadata for use vectors * (for when each feature creates one column of a vector) @@ -167,6 +166,27 @@ class TransientFeature val json = render(toJson) if (pretty) JsonMethods.pretty(json) else compact(json) } + + /** + * Tests the equality of the TransientFeature objects + */ + override def equals(that: Any): Boolean = { + that match { + case t: TransientFeature => + t.name == name && t.isResponse == isResponse && t.isRaw == isRaw && + t.uid == uid && t.typeName == typeName && t.originFeatures == originFeatures && + t.stages == stages + case _ => false + } + } + + /** + * Returns the hash code of this feature + * + * @return hash code + */ + override def hashCode(): Int = uid.hashCode + } object TransientFeature { diff --git a/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala index e060e305b5..5a81578ddd 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureType.scala @@ -176,7 +176,6 @@ object SomeValue { */ object FeatureType { - /** * Returns feature type name * @@ -225,7 +224,8 @@ object FeatureType { * @param t type tag * @return true if this type tag corresponds to one of the feature value types, false otherwise */ - def isFeatureValueType(t: TypeTag[_]): Boolean = FeatureType.featureValueTypeTags.contains(t.tpe.dealias.toString) + def isFeatureValueType(t: TypeTag[_]): Boolean = + FeatureType.featureValueTypeTags.contains(ReflectionUtils.dealisedTypeName(t.tpe)) /** * Feature type tag @@ -351,7 +351,7 @@ object FeatureType { // Text typeTag[Option[String]] ) - typeTags.map(tag => tag.tpe.dealias.toString -> tag).toMap + typeTags.map(tag => ReflectionUtils.dealisedTypeName(tag.tpe) -> tag).toMap } } diff --git
a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala index eec88c2b5a..4816d38fd6 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeDefaults.scala @@ -37,7 +37,7 @@ import org.apache.spark.ml.linalg.Vectors import scala.reflect.runtime.universe._ /** - * Default Feature Type values + * Default values for Feature Types */ case object FeatureTypeDefaults { diff --git a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala index ab585b8c6e..1cdf9e1b91 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeFactory.scala @@ -36,7 +36,7 @@ import org.apache.spark.ml.linalg.Vector import scala.reflect.runtime.universe._ /** - * Factory for creating feature type instances + * Factory for creating Feature Type instances * * @tparam T feature type */ @@ -55,7 +55,7 @@ sealed trait FeatureTypeFactory[T <: FeatureType] extends Serializable { } /** - * Factory for creating feature type instances from primitive values + * Factory for creating Feature Type instances from primitive values */ case object FeatureTypeFactory { /** diff --git a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala index 9101b288ec..3dc88e9aeb 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/FeatureTypeSparkConverter.scala @@ -38,7 +38,7 @@ import scala.reflect.runtime.universe._ /** - * Feature type from/to Spark primitives converter, i.e Real from/to Double etc. + * Feature Type from/to Spark primitives converter, i.e Real from/to Double etc. * * @tparam T feature type */ @@ -83,6 +83,30 @@ case object FeatureTypeSparkConverter { def fromSpark(value: Any): T = maker(value) } + /** + * For a given feature type class (or [[FeatureType.typeName]]) from/to Spark primitives converter, + * i.e Real from/to Double etc. 
+ * + * @param featureTypeName full class name of the feature type, see [[FeatureType.typeName]] + * @throws IllegalArgumentException if feature type name is unknown + * @return feature type from/to Spark primitives converter + */ + def fromFeatureTypeName(featureTypeName: String): FeatureTypeSparkConverter[_ <: FeatureType] = { + featureTypeSparkConverters.get(featureTypeName) match { + case Some(converter) => converter + case None => throw new IllegalArgumentException(s"Unknown feature type '$featureTypeName'") + } + } + + /** + * A map from feature type class to [[FeatureTypeSparkConverter]] + */ + private[types] val featureTypeSparkConverters: Map[String, FeatureTypeSparkConverter[_ <: FeatureType]] = + FeatureType.featureTypeTags.map { + case (featureTypeClass, featureTypeTag) => + featureTypeClass.getName -> + FeatureTypeSparkConverter[FeatureType]()(featureTypeTag.asInstanceOf[WeakTypeTag[FeatureType]]) + } /** * Converts feature type into a Spark primitive value diff --git a/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala b/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala index 60639cd604..c86d643a0b 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Geolocation.scala @@ -40,7 +40,8 @@ import Geolocation._ import scala.util.Try /** - * Represented as a list of latitude, longitude, accuracy (only populated if all are present) + * Represented as a list of latitude, longitude, accuracy + * The value is only populated if all are present, otherwise [[IllegalArgumentException]] is thrown. * * @param value a list of latitude, longitude, accuracy */ @@ -52,13 +53,33 @@ class Geolocation(val value: Seq[Double]) extends OPList[Double] with Location { } def this(lat: Double, lon: Double, accuracy: GeolocationAccuracy) = this(geolocationData(lat, lon, accuracy)) def this(v: (Double, Double, Double)) = this(geolocationData(v._1, v._2, v._3)) + /** + * Latitude value + */ def lat: Double = if (isEmpty) Double.NaN else value(0) + /** + * Longitude value + */ def lon: Double = if (isEmpty) Double.NaN else value(1) + /** + * Latitude value + */ def latitude: Double = lat + /** + * Longitude value + */ def longitude: Double = lon + + /** + * Geolocation accuracy value [[GeolocationAccuracy]] + */ def accuracy: GeolocationAccuracy = { if (isEmpty) GeolocationAccuracy.Unknown else GeolocationAccuracy.withValue(value(2).toInt) } + + /** + * Convert to [[GeoPoint]] value + */ def toGeoPoint: GeoPoint = { // If this Geolocation object is empty, then return the zero vector as the GeoPoint since we use // GeoPoint coordinates in aggregation functions @@ -110,14 +131,17 @@ sealed abstract class GeolocationAccuracy ( val value: Int, val name: String, - val rangeInMiles: Double) extends IntEnumEntry { - lazy val rangeInUnits: Double = rangeInMiles / EarthRadius + val rangeInMiles: Double +) extends IntEnumEntry { + /** + * Range in units of Earth Radius + */ + def rangeInUnits: Double = rangeInMiles / EarthRadius } case object GeolocationAccuracy extends IntEnum[GeolocationAccuracy] { val values: List[GeolocationAccuracy] = findValues.toList sortBy(_.rangeInMiles) - def geoUnitsToMiles(u: Double): Double = u * EarthRadius // No match for the address was found case object Unknown extends GeolocationAccuracy(0, name = "Unknown", rangeInMiles = EquatorInMiles / 2) @@ -142,14 +166,40 @@ case object GeolocationAccuracy extends IntEnum[GeolocationAccuracy] { // 
Center of the state case object State extends GeolocationAccuracy(10, name = "State", rangeInMiles = 150.0) + /** + * Convert units of Earth Radius into miles + * + * @param u units of Earth Radius + * @return miles + */ + def geoUnitsToMiles(u: Double): Double = u * EarthRadius + + /** + * Construct accuracy value for a given range in miles + * + * @param miles range in miles + * @return accuracy + */ def forRangeInMiles(miles: Double): GeolocationAccuracy = { val result = values.dropWhile(_.rangeInMiles < miles * 0.99).headOption getOrElse Unknown result } + /** + * Construct accuracy value for a given range in units of Earth Radius + * + * @param units units of Earth Radius + * @return accuracy + */ def forRangeInUnits(units: Double): GeolocationAccuracy = forRangeInMiles(geoUnitsToMiles(units)) + /** + * Find the worst accuracy value + * + * @param accuracies list of accuracies + * @return worst accuracy + */ def worst(accuracies: GeolocationAccuracy*): GeolocationAccuracy = { forRangeInMiles((Unknown :: accuracies.toList) map (_.rangeInMiles) max) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala index 496c86d45e..cf16a0deb8 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala @@ -31,6 +31,11 @@ package com.salesforce.op.features.types +/** + * A list of text values + * + * @param value list of text values + */ class TextList(val value: Seq[String]) extends OPList[String] { def this(v: String*)(implicit d: DummyImplicit) = this(v) } @@ -39,6 +44,11 @@ object TextList { def empty: TextList = FeatureTypeDefaults.TextList } +/** + * A list of date values + * + * @param value list of date values (values assumed to be in ms since Epoch) + */ class DateList(val value: Seq[Long]) extends OPList[Long] { def this(v: Long*)(implicit d: DummyImplicit) = this(v) } @@ -47,6 +57,11 @@ object DateList { def empty: DateList = FeatureTypeDefaults.DateList } +/** + * A list of date & time values + * + * @param value list of date & time values (values assumed to be in ms since Epoch) + */ class DateTimeList(value: Seq[Long]) extends DateList(value) { def this(v: Long*)(implicit d: DummyImplicit) = this(v) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala index 89951d2eed..e27e6d0f81 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Maps.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Maps.scala @@ -33,144 +33,267 @@ package com.salesforce.op.features.types import org.apache.spark.ml.linalg.Vector - +/** + * Map of text values + * + * @param value map of text values + */ class TextMap(val value: Map[String, String]) extends OPMap[String] object TextMap { def apply(value: Map[String, String]): TextMap = new TextMap(value) def empty: TextMap = FeatureTypeDefaults.TextMap } +/** + * Map of email values + * + * @param value map of email values + */ class EmailMap(val value: Map[String, String]) extends OPMap[String] object EmailMap { def apply(value: Map[String, String]): EmailMap = new EmailMap(value) def empty: EmailMap = FeatureTypeDefaults.EmailMap } +/** + * Map of base64 binary encoded values + * + * @param value map of base64 binary encoded values + */ class Base64Map(val value: Map[String, String]) extends OPMap[String] object Base64Map { def 
apply(value: Map[String, String]): Base64Map = new Base64Map(value) def empty: Base64Map = FeatureTypeDefaults.Base64Map } +/** + * Map of phone values + * + * @param value map of phone values + */ class PhoneMap(val value: Map[String, String]) extends OPMap[String] object PhoneMap { def apply(value: Map[String, String]): PhoneMap = new PhoneMap(value) def empty: PhoneMap = FeatureTypeDefaults.PhoneMap } +/** + * Map of ID values + * + * @param value map of ID values + */ class IDMap(val value: Map[String, String]) extends OPMap[String] object IDMap { def apply(value: Map[String, String]): IDMap = new IDMap(value) def empty: IDMap = FeatureTypeDefaults.IDMap } +/** + * Map of URL values + * + * @param value map of URL values + */ class URLMap(val value: Map[String, String]) extends OPMap[String] object URLMap { def apply(value: Map[String, String]): URLMap = new URLMap(value) def empty: URLMap = FeatureTypeDefaults.URLMap } +/** + * Map of text area values + * + * @param value map of text area values + */ class TextAreaMap(val value: Map[String, String]) extends OPMap[String] object TextAreaMap { def apply(value: Map[String, String]): TextAreaMap = new TextAreaMap(value) def empty: TextAreaMap = FeatureTypeDefaults.TextAreaMap } +/** + * Map of picklist values + * + * @param value map of picklist values + */ class PickListMap(val value: Map[String, String]) extends OPMap[String] object PickListMap { def apply(value: Map[String, String]): PickListMap = new PickListMap(value) def empty: PickListMap = FeatureTypeDefaults.PickListMap } +/** + * Map of combobox values + * + * @param value map of combobox values + */ class ComboBoxMap(val value: Map[String, String]) extends OPMap[String] object ComboBoxMap { def apply(value: Map[String, String]): ComboBoxMap = new ComboBoxMap(value) def empty: ComboBoxMap = FeatureTypeDefaults.ComboBoxMap } +/** + * Map of binary values + * + * @param value map of binary values + */ class BinaryMap(val value: Map[String, Boolean]) extends OPMap[Boolean] object BinaryMap { def apply(value: Map[String, Boolean]): BinaryMap = new BinaryMap(value) def empty: BinaryMap = FeatureTypeDefaults.BinaryMap } +/** + * Map of integral values + * + * @param value map of integral values + */ class IntegralMap(val value: Map[String, Long]) extends OPMap[Long] object IntegralMap { def apply(value: Map[String, Long]): IntegralMap = new IntegralMap(value) def empty: IntegralMap = FeatureTypeDefaults.IntegralMap } +/** + * Map of real values + * + * @param value map of real values + */ class RealMap(val value: Map[String, Double]) extends OPMap[Double] object RealMap { def apply(value: Map[String, Double]): RealMap = new RealMap(value) def empty: RealMap = FeatureTypeDefaults.RealMap } +/** + * Map of percent values + * + * @param value map of percent values + */ class PercentMap(val value: Map[String, Double]) extends OPMap[Double] object PercentMap { def apply(value: Map[String, Double]): PercentMap = new PercentMap(value) def empty: PercentMap = FeatureTypeDefaults.PercentMap } +/** + * Map of currency values + * + * @param value map of currency values + */ class CurrencyMap(val value: Map[String, Double]) extends OPMap[Double] object CurrencyMap { def apply(value: Map[String, Double]): CurrencyMap = new CurrencyMap(value) def empty: CurrencyMap = FeatureTypeDefaults.CurrencyMap } +/** + * Map of date values + * + * @param value map of date values + */ class DateMap(val value: Map[String, Long]) extends OPMap[Long] object DateMap { def apply(value: Map[String, Long]): DateMap = 
new DateMap(value) def empty: DateMap = FeatureTypeDefaults.DateMap } + +/** + * Map of date & time values + * + * @param value map of date & time values + */ class DateTimeMap(val value: Map[String, Long]) extends OPMap[Long] object DateTimeMap { def apply(value: Map[String, Long]): DateTimeMap = new DateTimeMap(value) def empty: DateTimeMap = FeatureTypeDefaults.DateTimeMap } +/** + * Map of multi picklist values + * + * @param value map of multi picklist values + */ class MultiPickListMap(val value: Map[String, Set[String]]) extends OPMap[Set[String]] object MultiPickListMap { def apply(value: Map[String, Set[String]]): MultiPickListMap = new MultiPickListMap(value) def empty: MultiPickListMap = FeatureTypeDefaults.MultiPickListMap } +/** + * Map of country values + * + * @param value map of country values + */ class CountryMap(val value: Map[String, String]) extends OPMap[String] with Location object CountryMap { def apply(value: Map[String, String]): CountryMap = new CountryMap(value) def empty: CountryMap = FeatureTypeDefaults.CountryMap } +/** + * Map of state values + * + * @param value map of state values + */ class StateMap(val value: Map[String, String]) extends OPMap[String] with Location object StateMap { def apply(value: Map[String, String]): StateMap = new StateMap(value) def empty: StateMap = FeatureTypeDefaults.StateMap } +/** + * Map of city values + * + * @param value map of city values + */ class CityMap(val value: Map[String, String]) extends OPMap[String] with Location object CityMap { def apply(value: Map[String, String]): CityMap = new CityMap(value) def empty: CityMap = FeatureTypeDefaults.CityMap } +/** + * Map of postal code values + * + * @param value map of postal code values + */ class PostalCodeMap(val value: Map[String, String]) extends OPMap[String] with Location object PostalCodeMap { def apply(value: Map[String, String]): PostalCodeMap = new PostalCodeMap(value) def empty: PostalCodeMap = FeatureTypeDefaults.PostalCodeMap } +/** + * Map of street values + * + * @param value map of street values + */ class StreetMap(val value: Map[String, String]) extends OPMap[String] with Location object StreetMap { def apply(value: Map[String, String]): StreetMap = new StreetMap(value) def empty: StreetMap = FeatureTypeDefaults.StreetMap } +/** + * Map of geolocation values + * + * @param value map of geolocation values + */ class GeolocationMap(val value: Map[String, Seq[Double]]) extends OPMap[Seq[Double]] with Location object GeolocationMap { def apply(value: Map[String, Seq[Double]]): GeolocationMap = new GeolocationMap(value) def empty: GeolocationMap = FeatureTypeDefaults.GeolocationMap } +/** + * Prediction representation - a map containing prediction, and optional raw prediction and probability values. + * + * This value can only be constructed from a non empty map containing a prediction, + * and optional raw prediction and probability values, otherwise [[NonNullableEmptyException]] is thrown. + * + * @param value map containing prediction, and optional raw prediction and probability values. 
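Prediction, whose definition resumes just below, flattens the prediction, raw prediction, and probability values into a single keyed map; score returns the probabilities when present and falls back to the lone prediction otherwise. A hypothetical construction sketch using the Array-based apply from this diff:

  import com.salesforce.op.features.types.Prediction

  // Builds a map keyed per Prediction.Keys: the prediction plus
  // indexed rawPrediction/probability entries
  val p = Prediction(prediction = 1.0, rawPrediction = Array(0.2, 0.8), probability = Array(0.3, 0.7))
  p.prediction    // 1.0
  p.rawPrediction // Array(0.2, 0.8)
  p.probability   // Array(0.3, 0.7)
  p.score         // Array(0.3, 0.7): probabilities exist, so they take precedence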
+ */ class Prediction private[op](value: Map[String, Double]) extends RealMap(value) with NonNullable { import Prediction.Keys._ @@ -190,9 +313,27 @@ class Prediction private[op](value: Map[String, Double]) extends RealMap(value) s"starting with '$RawPredictionName' or '$ProbabilityName'" ) private def keysStartsWith(name: String): Array[String] = value.keys.filter(_.startsWith(name)).toArray.sorted + + /** + * Prediction value + */ def prediction: Double = value(PredictionName) + + /** + * Raw prediction values + */ def rawPrediction: Array[Double] = keysStartsWith(RawPredictionName).map(value) + + /** + * Probability values + */ def probability: Array[Double] = keysStartsWith(ProbabilityName).map(value) + + /** + * Score values (based on probability or prediction) + * + * @return probability values if present, otherwise the prediction value + */ def score: Array[Double] = { val probKeys = keysStartsWith(ProbabilityName) if (probKeys.nonEmpty) probKeys.map(value) else Array(value(PredictionName)) @@ -206,11 +347,59 @@ object Prediction { } import Keys._ + /** + * Creates [[Prediction]] given a prediction value + * + * @param prediction prediction value + * @return [[Prediction]] + */ def apply(prediction: Double): Prediction = new Prediction(Map(PredictionName -> prediction)) + /** + * Creates [[Prediction]] given a prediction value and raw prediction values + * + * @param prediction prediction value + * @param rawPrediction raw prediction values + * @return [[Prediction]] + */ + def apply(prediction: Double, rawPrediction: Vector): Prediction = { + val rawPred = rawPrediction.toArray.zipWithIndex.map { case (v, i) => s"${RawPredictionName}_$i" -> v } + val pred = PredictionName -> prediction + new Prediction(rawPred.toMap + pred) + } + + /** + * Creates [[Prediction]] given a prediction value and a raw prediction value + * + * @param prediction prediction value + * @param rawPrediction raw prediction value + * @return [[Prediction]] + */ + def apply(prediction: Double, rawPrediction: Double): Prediction = { + val rawPred = s"${RawPredictionName}_0" -> rawPrediction + val pred = PredictionName -> prediction + new Prediction(Map(rawPred, pred)) + } + + /** + * Creates [[Prediction]] given a prediction value, raw prediction and probability values + * + * @param prediction prediction value + * @param rawPrediction raw prediction values + * @param probability probability values + * @return [[Prediction]] + */ def apply(prediction: Double, rawPrediction: Vector, probability: Vector): Prediction = apply(prediction, rawPrediction = rawPrediction.toArray, probability = probability.toArray) + /** + * Creates [[Prediction]] given a prediction value, raw prediction and probability values + * + * @param prediction prediction value + * @param rawPrediction raw prediction values + * @param probability probability values + * @return [[Prediction]] + */ def apply(prediction: Double, rawPrediction: Array[Double], probability: Array[Double]): Prediction = { val rawPred = rawPrediction.zipWithIndex.map { case (v, i) => s"${RawPredictionName}_$i" -> v } val prob = probability.zipWithIndex.map { case (v, i) => s"${ProbabilityName}_$i" -> v } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala b/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala index 0a9b7238bb..7d90c9a102 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Numerics.scala @@ -31,6 +31,13
@@ package com.salesforce.op.features.types +/** + * Real value representation + * + * A base class for all the real Feature Types + * + * @param value real + */ class Real(val value: Option[Double]) extends OPNumeric[Double] { def this(value: Double) = this(Option(value)) final def toDouble: Option[Double] = value @@ -42,8 +49,16 @@ object Real { def empty: Real = FeatureTypeDefaults.Real } -class RealNN private[op](v: Option[Double]) extends Real( - if (v == null || v.isEmpty) throw new NonNullableEmptyException(classOf[RealNN]) else v +/** + * Real non-nullable value representation + * + * This value can only be constructed from a concrete [[Double]] value; + * if an empty value is passed, a [[NonNullableEmptyException]] is thrown. + * + * @param value real + */ +class RealNN private[op](value: Option[Double]) extends Real( + if (value == null || value.isEmpty) throw new NonNullableEmptyException(classOf[RealNN]) else value ) with NonNullable { def this(value: Double) = this(Option(value)) } @@ -51,6 +66,11 @@ object RealNN { def apply(value: Double): RealNN = new RealNN(value) } +/** + * Binary value representation + * + * @param value binary + */ class Binary(val value: Option[Boolean]) extends OPNumeric[Boolean] with SingleResponse { def this(value: Boolean) = this(Option(value)) final def toDouble: Option[Double] = value.map(if (_) 1.0 else 0.0) @@ -61,6 +81,13 @@ object Binary { def empty: Binary = FeatureTypeDefaults.Binary } +/** + * Integral value representation + * + * A base class for all the integral Feature Types + * + * @param value integral + */ class Integral(val value: Option[Long]) extends OPNumeric[Long] { def this(value: Long) = this(Option(value)) final def toDouble: Option[Double] = value.map(_.toDouble) @@ -71,6 +98,11 @@ object Integral { def empty: Integral = FeatureTypeDefaults.Integral } +/** + * Percentage value representation + * + * @param value percentage + */ class Percent(value: Option[Double]) extends Real(value) { def this(value: Double) = this(Option(value)) } @@ -80,6 +112,11 @@ object Percent { def empty: Percent = FeatureTypeDefaults.Percent } +/** + * Currency value representation + * + * @param value currency + */ class Currency(value: Option[Double]) extends Real(value) { def this(value: Double) = this(Option(value)) } @@ -89,6 +126,11 @@ object Currency { def empty: Currency = FeatureTypeDefaults.Currency } +/** + * Date value representation + * + * @param value date (assumed to be in ms since Epoch) + */ class Date(value: Option[Long]) extends Integral(value) { def this(value: Long) = this(Option(value)) } @@ -98,6 +140,11 @@ object Date { def empty: Date = FeatureTypeDefaults.Date } +/** + * Date & time value representation + * + * @param value date & time (assumed to be in ms since Epoch) + */ class DateTime(value: Option[Long]) extends Date(value) { def this(value: Long) = this(Option(value)) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala b/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala index ef95e7cc9b..fb59a60933 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPCollection.scala @@ -33,6 +33,6 @@ package com.salesforce.op.features.types /** - * A base class for all the collections (arrays, lists, sets, maps, vector etc) + * A base class for all the collection Feature Types (maps, lists, sets, vector etc.)
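The nullable/non-nullable split documented above is the practical difference between Real and RealNN; a quick sketch under the constructors shown in this diff:

  import com.salesforce.op.features.types._

  val r = new Real(3.14) // nullable: wraps Option(3.14)
  val e = Real.empty     // fine: value is None
  val n = RealNN(3.14)   // non-nullable: requires a concrete Double
  // Constructing a RealNN around an empty value throws NonNullableEmptyException;
  // its Option-based constructor is private[op], so this is enforced at creation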
*/ abstract class OPCollection extends FeatureType diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPList.scala b/features/src/main/scala/com/salesforce/op/features/types/OPList.scala index 317f6a224c..70ae77c52f 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPList.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPList.scala @@ -34,11 +34,19 @@ package com.salesforce.op.features.types import scala.reflect.ClassTag /** - * A base class for all the list feature types + * A base class for all the list Feature Types + * * @tparam A item type */ abstract class OPList[A](implicit val cta: ClassTag[A]) extends OPCollection { override type Value = Seq[A] + final def isEmpty: Boolean = value.isEmpty + + /** + * Converts list to an array + * + * @return array of A + */ final def toArray: Array[A] = value.toArray(cta) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala b/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala index f71c952dc7..a2374f39d9 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPMap.scala @@ -32,11 +32,15 @@ package com.salesforce.op.features.types /** - * A base class for all the map feature types + * A base class for all the map Feature Types + * * @tparam A item type */ abstract class OPMap[A] extends OPCollection { type Element = A + override type Value = Map[String, A] + final def isEmpty: Boolean = value.isEmpty + } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala b/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala index f0c9e217fc..14f949a59b 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPNumeric.scala @@ -33,11 +33,19 @@ package com.salesforce.op.features.types /** - * A base class for all the numeric feature types + * A base class for all the numeric Feature Types + * * @tparam N number type (Long, Double etc) */ abstract class OPNumeric[N] extends FeatureType { type Value = Option[N] + + /** + * Convert numeric value to [[Double]] representation + * + * @return [[Double]] representation of numeric value + */ def toDouble: Option[Double] + final def isEmpty: Boolean = value.isEmpty } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala b/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala index 10e2b5bcc0..3f4f80d010 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPSet.scala @@ -35,10 +35,16 @@ import scala.reflect.ClassTag /** - * A base class for all the set feature types + * A base class for all the set Feature Types */ abstract class OPSet[A](implicit val cta: ClassTag[A]) extends OPCollection with MultiResponse { type Value <: scala.collection.Set[A] final def isEmpty: Boolean = value.isEmpty + + /** + * Converts set to an array + * + * @return array of A + */ final def toArray: Array[A] = value.toArray(cta) } diff --git a/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala b/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala index 5b39d7fb11..64417d88ff 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/OPVector.scala 
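OPList and OPSet now both expose toArray alongside isEmpty; sketching the additions on two concrete types from this module (values are illustrative):

  import com.salesforce.op.features.types._

  val words = new TextList(Seq("hello", "world"))
  words.isEmpty // false
  words.toArray // Array("hello", "world")

  val picks = new MultiPickList(Set("red", "green"))
  picks.isEmpty // false
  picks.toArray // the set's elements as an Array, order unspecified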
@@ -31,9 +31,13 @@ package com.salesforce.op.features.types -import org.apache.spark.ml.linalg.Vector - +import org.apache.spark.ml.linalg._ +/** + * Vector representation + * + * @param value vector ([[SparseVector]] or [[DenseVector]]) + */ class OPVector(val value: Vector) extends OPCollection { type Value = Vector final def isEmpty: Boolean = value.size == 0 diff --git a/features/src/main/scala/com/salesforce/op/features/types/Sets.scala b/features/src/main/scala/com/salesforce/op/features/types/Sets.scala index 231dd4fa4d..b188699500 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Sets.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Sets.scala @@ -31,7 +31,11 @@ package com.salesforce.op.features.types - +/** + * Multi picklist value that represents multiple selections from a set of values + * + * @param value multiple selections from a set of values + */ class MultiPickList(val value: Set[String]) extends OPSet[String] { type Value = Set[String] } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Text.scala b/features/src/main/scala/com/salesforce/op/features/types/Text.scala index 06fa778652..279fa8243e 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Text.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Text.scala @@ -40,7 +40,9 @@ import org.apache.commons.io.input.CharSequenceInputStream import org.apache.commons.validator.routines.UrlValidator /** - * A base class for all the text feature types + * Text value representation + * + * A base class for all the text Feature Types * * @param value text value */ @@ -56,9 +58,22 @@ object Text { def empty: Text = FeatureTypeDefaults.Text } +/** + * Email value representation + * + * @param value email value + */ class Email(value: Option[String]) extends Text(value) { def this(value: String) = this(Option(value)) + /** + * Extract email prefix + * @return None if the email is invalid or empty; otherwise Some value with the prefix + */ def prefix: Option[String] = Email.prefixOrDomain(this, isPrefix = true) + /** + * Extract email domain + * @return None if the email is invalid or empty; otherwise Some value with the domain + */ def domain: Option[String] = Email.prefixOrDomain(this, isPrefix = false) } object Email { @@ -79,7 +94,11 @@ object Email { if (!m.matches()) None else if (isPrefix) Option(m.group(1)) else Option(m.group(2)) ) } - +/** + * Base64 encoded binary value representation + * + * @param value base64 encoded binary value + */ class Base64(value: Option[String]) extends Text(value) { def this(value: String) = this(Option(value)) /** @@ -107,13 +126,17 @@ class Base64(value: Option[String]) extends Text(value) { */ def asString: Option[String] = asBytes map (new String(_)) } - object Base64 { def apply(value: Option[String]): Base64 = new Base64(value) def apply(value: String): Base64 = new Base64(value) def empty: Base64 = FeatureTypeDefaults.Base64 }
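A short usage sketch for the Email and Base64 types above; the constructors mirror the def this(value: String) pattern shown in the diff, and prefix/domain behave as their Scaladoc describes:

import com.salesforce.op.features.types._

val email = new Email("ceo@salesforce.com")
email.prefix            // Some("ceo")
email.domain            // Some("salesforce.com")
new Email(None).prefix  // None: empty or invalid emails yield no prefix

// Base64 round-trip: encode with the standard JDK codec, then decode via asString
val encoded = new Base64(java.util.Base64.getEncoder.encodeToString("hello".getBytes))
encoded.asString        // Some("hello")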
+/** + * Phone number value representation, e.g. '+1-650-113-111-2222' + * + * @param value phone number + */ class Phone(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -123,6 +146,11 @@ object Phone { def empty: Phone = FeatureTypeDefaults.Phone } +/** + * Unique identifier value representation + * + * @param value unique identifier + */ class ID(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -132,6 +160,11 @@ object ID { def empty: ID = FeatureTypeDefaults.ID } +/** + * URL value representation + * + * @param value url + */ class URL(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) /** @@ -147,7 +180,7 @@ class URL(value: Option[String]) extends Text(value){ */ def isValid(protocols: Array[String]): Boolean = value.exists(new UrlValidator(protocols).isValid) /** - * Extracts url domain, i.e. salesforce.com, data.com etc. + * Extracts url domain, e.g. 'salesforce.com', 'data.com' etc. */ def domain: Option[String] = value map (new java.net.URL(_).getHost) /** @@ -161,6 +194,11 @@ object URL { def empty: URL = FeatureTypeDefaults.URL } +/** + * Large text values (more than 4000 bytes) + * + * @param value large text value + */ class TextArea(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -170,6 +208,11 @@ object TextArea { def empty: TextArea = FeatureTypeDefaults.TextArea } +/** + * A single text value that represents a single selection from a set of values + * + * @param value selected text + */ class PickList(value: Option[String]) extends Text(value) with SingleResponse { def this(value: String) = this(Option(value)) } @@ -178,7 +221,11 @@ object PickList { def apply(value: String): PickList = new PickList(value) def empty: PickList = FeatureTypeDefaults.PickList } - +/** + * A single text value that represents a selection from a set of values or a user specified one + * + * @param value selected or user specified text + */ class ComboBox(value: Option[String]) extends Text(value){ def this(value: String) = this(Option(value)) } @@ -188,6 +235,11 @@ object ComboBox { def empty: ComboBox = FeatureTypeDefaults.ComboBox } +/** + * Country value representation, e.g. 'United States of America', 'France' etc. + * + * @param value country + */ class Country(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -197,6 +249,11 @@ object Country { def empty: Country = FeatureTypeDefaults.Country } +/** + * State value representation, e.g. 'CA', 'OR' etc. + * + * @param value state + */ class State(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -206,6 +263,11 @@ object State { def empty: State = FeatureTypeDefaults.State } +/** + * Postal code value representation, e.g. '92101', '72212-341' etc. + * + * @param value postal code + */ class PostalCode(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -215,6 +277,11 @@ object PostalCode { def empty: PostalCode = FeatureTypeDefaults.PostalCode } +/** + * City value representation, e.g. 'New York', 'Paris' etc. + * + * @param value city + */ class City(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } @@ -224,6 +291,11 @@ object City { def empty: City = FeatureTypeDefaults.City }
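Similarly, a hedged sketch of the URL helpers and the Location-flavored text types defined here; the validation rules are those of commons-validator's UrlValidator, so the exact outcomes depend on that library:

import com.salesforce.op.features.types._

val url = new URL("https://salesforce.com/products")
url.isValid(protocols = Array("http", "https"))  // true
url.isValid(protocols = Array("ftp"))            // false: protocol not allowed
url.domain                                       // Some("salesforce.com")

// The geographic types all mix in the Location marker trait
val country = new Country("United States of America")
val city = new City("Paris")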
+/** + * Street representation, e.g. '123 University Ave' etc. + * + * @param value street + */ class Street(value: Option[String]) extends Text(value) with Location { def this(value: String) = this(Option(value)) } diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala index e4f40b3514..7f0a121102 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageParams.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.{Metadata, StructType} /** - * Parameters and functionalities shared across the input features + * Parameters and functions shared across the input features */ trait InputParams extends Params { diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala index 38f41e516a..30d7edb9a1 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageWriter.scala @@ -77,7 +77,7 @@ final class OpPipelineStageWriter(val stage: OpPipelineStageBase) extends MLWrit */ def writeToMap: Map[String, Any] = { // We produce stage metadata for all the Spark params - val metadataJson = SparkDefaultParamsReadWrite.getMetadataToSave(stage, sc) + val metadataJson = SparkDefaultParamsReadWrite.getMetadataToSave(stage) // Add isModel indicator val metadata = parse(metadataJson).extract[Map[String, Any]] + (FieldNames.IsModel.entryName -> isModel) // In case we stumbled upon a model instance, we also include its ctor args @@ -102,10 +102,10 @@ final class OpPipelineStageWriter(val stage: OpPipelineStageBase) extends MLWrit val anyValue = argValue match { // Special handling for Feature Type TypeTags case t: TypeTag[_] if FeatureType.isFeatureType(t) || FeatureType.isFeatureValueType(t) => - AnyValue(`type` = AnyValueTypes.TypeTag, value = t.tpe.dealias.toString) + AnyValue(`type` = AnyValueTypes.TypeTag, value = ReflectionUtils.dealisedTypeName(t.tpe)) case t: TypeTag[_] => throw new RuntimeException( - s"Unknown type tag '${t.tpe.dealias.toString}'. " + + s"Unknown type tag '${t.tpe.toString}'. " + "Only Feature and Feature Value type tags are supported for serialization." ) diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala index df2e5ed98d..e23669cd29 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStages.scala @@ -32,9 +32,10 @@ package com.salesforce.op.stages import com.salesforce.op.features._ -import com.salesforce.op.features.types.{FeatureType, OPVector} +import com.salesforce.op.features.types.FeatureType import com.salesforce.op.utils.reflection.ReflectionUtils import com.salesforce.op.utils.spark.RichDataType._ +import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{MLWritable, MLWriter} import org.apache.spark.ml.{PipelineStage, Transformer}
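For orientation, the map returned by writeToMap has roughly the shape sketched below. The "isModel" and "ctorArgs" key strings are assumptions (the real keys come from FieldNames), while "class", "timestamp", "sparkVersion", "uid", and "paramMap" follow the getMetadataToSave implementation shown later in this patch:

// Hypothetical shape only: stage names and values are illustrative
val stageMetadata: Map[String, Any] = Map(
  "class" -> "com.salesforce.op.stages.SomeTransformer",  // stage class name
  "timestamp" -> 1514764800000L,
  "sparkVersion" -> "2.2.1",
  "uid" -> "SomeTransformer_000000000001",
  "paramMap" -> Map("outputMetadata" -> "..."),            // all Spark params
  "isModel" -> false,                                      // assumed name for FieldNames.IsModel
  "ctorArgs" -> Map.empty                                  // assumed name; models only
)

Feature-type TypeTag ctor args are stored by their dealiased type name, per the AnyValue handling above.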
@@ -203,7 +204,7 @@ trait OpPipelineStage[O <: FeatureType] extends OpPipelineStageBase { * Should output feature be a response? Yes, if any of the input features are. * @return true if the output feature should be a response */ - protected def outputIsResponse: Boolean = getTransientFeatures().exists(_.isResponse) + def outputIsResponse: Boolean = getTransientFeatures().exists(_.isResponse) } @@ -553,12 +554,28 @@ trait OpPipelineStageN[I <: FeatureType, O <: FeatureType] extends OpPipelineSta * Trait to mix into transformers that indicates their transform functions can be combined into a single stage */ private[op] trait OpTransformer { - self: OpPipelineStage[_] with Transformer => /** - * Creates a transform function to convert Row to a value - * @return a transform function to convert Row to a value + * Feature name (key) -> value lookup, e.g. Row, Map etc. + */ + type KeyValue = String => Any + + /** + * Creates a transform function to transform Row to a value + * @return a transform function to transform Row to a value + */ + def transformRow: Row => Any = r => transformKeyValue(r.getAny) + + /** + * Creates a transform function to transform Map to a value + * @return a transform function to transform Map to a value + */ + def transformMap: Map[String, Any] => Any = m => transformKeyValue(m.apply) + + /** + * Creates a transform function to transform any key/value to a value + * @return a transform function to transform any key/value to a value */ - def transformRow: Row => Any + def transformKeyValue: KeyValue => Any } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala similarity index 99% rename from core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala index 0a463eb482..1950d28286 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryEstimator.scala @@ -35,6 +35,7 @@ import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.{FeatureType, FeatureTypeSparkConverter} import com.salesforce.op.stages.OpPipelineStage2 import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Dataset, Encoder, Encoders} import org.apache.spark.util.ClosureUtils diff --git a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala index 55ccd97a38..4b2b70702e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/binary/BinaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types._ import com.salesforce.op.stages.{OpPipelineStage2, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -85,9 +84,9 @@ trait OpTransformer2[I1 <:
FeatureType, I2 <: FeatureType, O <: FeatureType] } private val transform2Fn = FeatureSparkTypes.transform2[I1, I2, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val (in1name, in2name) = (in1.name, in2.name) - (row: Row) => transform2Fn(row.getAny(in1name), row.getAny(in2name)) + (kv: KeyValue) => transform2Fn(kv(in1name), kv(in2name)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimator.scala diff --git a/core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala index ac487b3ace..19286cd942 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage4, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -90,9 +89,9 @@ trait OpTransformer4[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, I4 } private val transform4Fn = FeatureSparkTypes.transform4[I1, I2, I3, I4, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val (in1name, in2name, in3name, in4name) = (in1.name, in2.name, in3.name, in4.name) - (row: Row) => transform4Fn(row.getAny(in1name), row.getAny(in2name), row.getAny(in3name), row.getAny(in4name)) + (kv: KeyValue) => transform4Fn(kv(in1name), kv(in2name), kv(in3name), kv(in4name)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceEstimator.scala diff --git a/core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala index 8126de32b1..8c68aa7eff 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala +++ 
b/features/src/main/scala/com/salesforce/op/stages/base/sequence/SequenceTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStageN, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -84,9 +83,9 @@ trait OpTransformerN[I <: FeatureType, O <: FeatureType] } private val transformNFn = FeatureSparkTypes.transformN[I, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val inNames = inN.map(_.name) - (row: Row) => transformNFn(inNames.map(name => row.getAny(name))) + (kv: KeyValue) => transformNFn(inNames.map(name => kv(name))) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryEstimator.scala diff --git a/core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala index 6feabd6d69..1d03d14a41 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/ternary/TernaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage3, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -86,9 +85,9 @@ trait OpTransformer3[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, O } private val transform3Fn = FeatureSparkTypes.transform3[I1, I2, I3, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val (in1name, in2name, in3name) = (in1.name, in2.name, in3.name) - (row: Row) => transform3Fn(row.getAny(in1name), row.getAny(in2name), row.getAny(in3name)) + (kv: KeyValue) => transform3Fn(kv(in1name), kv(in2name), kv(in3name)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala b/features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala rename to features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryEstimator.scala diff --git 
a/core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala b/features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala similarity index 96% rename from core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala rename to features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala index bc6e4229f5..28004bcca3 100644 --- a/core/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala +++ b/features/src/main/scala/com/salesforce/op/stages/base/unary/UnaryTransformer.scala @@ -35,10 +35,9 @@ import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage1, OpTransformer} -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag @@ -82,9 +81,9 @@ trait OpTransformer1[I <: FeatureType, O <: FeatureType] } private val transform1Fn = FeatureSparkTypes.transform1[I, O](transformFn) - override def transformRow: Row => Any = { + override def transformKeyValue: KeyValue => Any = { val inName = in1.name - (r: Row) => transform1Fn(r.getAny(inName)) + (kv: KeyValue) => transform1Fn(kv(inName)) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala b/features/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala similarity index 95% rename from core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala rename to features/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala index d08373e710..4977a8b345 100644 --- a/core/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala +++ b/features/src/main/scala/com/salesforce/op/stages/sparkwrappers/generic/SparkWrapperParams.scala @@ -32,8 +32,8 @@ package com.salesforce.op.stages.sparkwrappers.generic import com.salesforce.op.stages.SparkStageParam -import org.apache.spark.ml.param.{Param, Params, StringArrayParam} import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.param.{Param, Params, StringArrayParam} /** @@ -41,7 +41,7 @@ import org.apache.spark.ml.PipelineStage * * @tparam S type of spark object to wrap */ -private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Params { +trait SparkWrapperParams[S <: PipelineStage with Params] extends Params { self: PipelineStage => final val sparkInputColParamNames = new StringArrayParam( @@ -59,7 +59,7 @@ private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Par /** * this must be private so that the stage can have its path set properly */ - private final val savePath = new Param[String]( + final val savePath = new Param[String]( parent = this, name = "savePath", doc = "path to save the spark stage" ) @@ -68,7 +68,7 @@ private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Par /** * this must be private so that the stage can have its path set properly */ - private final val sparkMlStage = new SparkStageParam[S]( + final val sparkMlStage = new SparkStageParam[S]( parent = this, name = "sparkMlStage", doc = "the spark stage that is being wrapped for optimus prime" ) @@ -80,7
+80,7 @@ private[op] trait SparkWrapperParams[S <: PipelineStage with Params] extends Par this } - def setSparkMlStage(stage: Option[S]): this.type = { + protected def setSparkMlStage(stage: Option[S]): this.type = { set(sparkMlStage, stage) sparkMlStage.savePath = Option($(savePath)) this diff --git a/core/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala b/features/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala rename to features/src/main/scala/com/salesforce/op/test/FeatureTestBase.scala diff --git a/features/src/main/scala/com/salesforce/op/test/FeatureTypeEquality.scala b/features/src/main/scala/com/salesforce/op/test/FeatureTypeEquality.scala new file mode 100644 index 0000000000..d254c6de48 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/FeatureTypeEquality.scala @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.test + +import com.salesforce.op.features.types.FeatureType +import org.scalactic.Equality +import org.scalatest.Suite + +/** + * Feature Type equality instances mixin. + * Allowing users to customize equality in tests, for example to allow numerical tolerance. 
+ * + * @tparam O feature type + */ +trait FeatureTypeEquality[O <: FeatureType] { + self: Suite => + + /** + * Feature type equality + */ + implicit val featureTypeEquality: Equality[O] = new Equality[O] { + def areEqual(a: O, b: Any): Boolean = a.equals(b) + } + + /** + * Feature type sequence equality + */ + implicit val seqEquality: Equality[Seq[O]] = new Equality[Seq[O]] { + def areEqual(a: Seq[O], b: Any): Boolean = b match { + case s: Seq[_] if a.size == s.size => a.zip(s).forall { case (av, bv) => av === bv } + case _ => false + } + } + +} diff --git a/features/src/main/scala/com/salesforce/op/test/OpEstimatorSpec.scala b/features/src/main/scala/com/salesforce/op/test/OpEstimatorSpec.scala new file mode 100644 index 0000000000..51de105af6 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/OpEstimatorSpec.scala @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.test + +import java.io.File + +import com.salesforce.op.features.types._ +import com.salesforce.op.stages._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.sql.Dataset +import org.scalactic.Equality +import org.scalatest.events.{Event, TestFailed} +import org.scalatest.{Args, Reporter} + +import scala.collection.mutable.ArrayBuffer +import scala.reflect._ +import scala.reflect.runtime.universe._ + +/** + * Base test class for testing OP estimator instances. + * Includes common tests for fitting estimator and verifying the fitted model. 
+ * + * @tparam O output feature type + * @tparam ModelType model type produced by this estimator + * @tparam EstimatorType type of the estimator being tested + */ +abstract class OpEstimatorSpec[O <: FeatureType : WeakTypeTag : ClassTag, +ModelType <: Model[ModelType] with OpPipelineStage[O] with OpTransformer : ClassTag, +EstimatorType <: Estimator[ModelType] with OpPipelineStage[O] : ClassTag] + extends OpPipelineStageSpec[O, EstimatorType] { + + /** + * Estimator instance to be tested + */ + val estimator: EstimatorType + + /** + * Input Dataset to fit & transform + */ + val inputData: Dataset[_] + + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult: Seq[O] + + final override lazy val stage = estimator + + /** + * Model (transformer) to fit + */ + final lazy val model: ModelType = estimator.fit(inputData) + + it should "fit a model" in { + model should not be null + model shouldBe a[ModelType] + } + + it should behave like modelSpec() + + it should "have fitted a model that matches the estimator" in { + withClue("Model doesn't have a parent:") { + model.hasParent shouldBe true + } + withClue("Model parent should be the original estimator instance:") { + model.parent shouldBe estimator + } + withClue("Model and estimator output feature names don't match:") { + model.getOutputFeatureName shouldBe estimator.getOutputFeatureName + } + assert(model.asInstanceOf[OpPipelineStageBase], estimator, expectSameClass = false) + } + + // TODO: test metadata + + + /** + * Register all model spec tests + */ + private def modelSpec(): Unit = { + // Define transformer spec for the fitted model reusing the same inputs & Spark context + val modelSpec = new OpTransformerSpec[O, ModelType] { + override implicit val featureTypeEquality: Equality[O] = OpEstimatorSpec.this.featureTypeEquality + override implicit val seqEquality: Equality[Seq[O]] = OpEstimatorSpec.this.seqEquality + lazy val transformer: ModelType = model.setInputFeatureArray(estimator.getInputFeatures()) + lazy val inputData: Dataset[_] = OpEstimatorSpec.this.inputData + lazy val expectedResult: Seq[O] = OpEstimatorSpec.this.expectedResult + override implicit lazy val spark = OpEstimatorSpec.this.spark + override def specName: String = "model" + override def tempDir: File = OpEstimatorSpec.this.tempDir + } + + // Register all model spec tests + for { + testName <- modelSpec.testNames + } registerTest(testName) { + // Run test & collect failures + val failures = ArrayBuffer.empty[TestFailed] + val reporter = new Reporter { + def apply(event: Event): Unit = event match { + case f: TestFailed => failures += f + case _ => + } + } + // Note: We set 'runTestInNewInstance = true' to avoid restarting Spark context on every test run + val args = Args(reporter, runTestInNewInstance = true) + modelSpec.run(testName = Some(testName), args = args) + + // Propagate the failure if any + for {failure <- failures.headOption} { + failure.throwable.map(fail(failure.message, _)).getOrElse(fail(failure.message)) + } + } + } + +} diff --git a/features/src/main/scala/com/salesforce/op/test/OpPipelineStageSpec.scala b/features/src/main/scala/com/salesforce/op/test/OpPipelineStageSpec.scala new file mode 100644 index 0000000000..d5c674fc42 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/OpPipelineStageSpec.scala @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.test + +import com.salesforce.op.features.Feature +import com.salesforce.op.features.types._ +import com.salesforce.op.stages._ +import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.param.ParamMap +import org.scalatest._ + +import scala.reflect._ +import scala.reflect.runtime.universe._ +import scala.util.Failure + + +/** + * Spec for testing [[OpPipelineStage]] instances (transformers or estimators). + * Includes common tests for output feature, copy, serialization, json read/write etc. 
+ * + * @tparam O output feature type + * @tparam StageType [[OpPipelineStage]] type being tested (transformer or estimator) + */ +abstract class OpPipelineStageSpec[O <: FeatureType : WeakTypeTag : ClassTag, +StageType <: OpPipelineStage[O] : ClassTag] + extends FlatSpec + with FeatureTypeEquality[O] + with TestSparkContext + with OpPipelineStageAsserts { + + /** + * [[OpPipelineStage]] instance to be tested + */ + val stage: StageType + + /** + * Spec name (StageType[O] by default) + */ + def specName: String = Spec[O, StageType] + + specName should "produce output feature" in { + val output = stage.getOutput() + output shouldBe new Feature[O]( + name = stage.getOutputFeatureName, + originStage = stage, + isResponse = stage.outputIsResponse, + parents = stage.getInputFeatures() + ) + } + it should "copy" in { + val copy = stage.copy(new ParamMap()) + copy shouldBe a[StageType] + assert(copy, stage) + } + it should "be serializable" in { + stage.checkSerializable match { + case Failure(e) => fail("Stage is not serializable", e) + case _ => + } + } + it should "be json writable/readable" in { + val loaded = writeAndRead(stage) + assert(loaded, stage) + } + + /** + * A helper function to write and read stage into savePath + * + * @param stage stage instance to write and then read + * @param savePath Spark stage save path + * @return read stage + */ + protected def writeAndRead(stage: StageType, savePath: String = stageSavePath): OpPipelineStageBase = { + val savable = stage match { + case s: SparkWrapperParams[_] => s.setSavePath(savePath) + case s => s + } + val json = new OpPipelineStageWriter(savable).overwrite().writeToJsonString + new OpPipelineStageReader(savable).loadFromJsonString(json) + } + + /** + * Spark stage save path + */ + protected def stageSavePath: String = s"$tempDir/${specName.filter(_.isLetterOrDigit)}-${System.currentTimeMillis()}" + +} + + +/** + * Stage assertion for [[OpPipelineStage]] + */ +trait OpPipelineStageAsserts extends AppendedClues { + self: Matchers => + + /** + * Assert stage instances + * + * @param stage instance to assert + * @param expected instance to assert against + * @param expectSameClass should expect the same class or not + * @return + */ + def assert(stage: OpPipelineStageBase, expected: OpPipelineStageBase, expectSameClass: Boolean = true): Assertion = { + def stageType(s: OpPipelineStageBase) = if (s.isInstanceOf[Estimator[_]]) "estimator" else "transformer" + lazy val stageClue = + if (expectSameClass) s", while asserting ${stage.getClass.getSimpleName} ${stageType(stage)}." + else { + s", while asserting ${stage.getClass.getSimpleName} ${stageType(stage)} " + + s"against ${expected.getClass.getSimpleName} ${stageType(expected)}." 
+ } + def clue[T](msg: String)(fun: => T) = { withClue(msg)(fun) } withClue stageClue + + if (expectSameClass) { + clue("Stage classes don't match:") { + stage.getClass shouldBe expected.getClass + } + clue("Params are not the same:") { + stage.params should contain theSameElementsAs expected.params + } + expected.params.foreach { p => + clue(s"Param '${p.name}' should exist:") { + stage.hasParam(p.name) shouldBe expected.hasParam(p.name) + } + // TODO: add params value comparison (note: can be tricky) + // withClue(s"Param '${p.name}' values do not match:") { + // stage.get(p) shouldBe expected.get(p) + // } + } + } + clue("Stage UIDs don't match:") { + stage.uid shouldBe expected.uid + } + clue("Stage outputs don't match:") { + stage.getOutput() shouldBe expected.getOutput() + } + clue("Operation names don't match:") { + stage.operationName shouldBe expected.operationName + } + clue("Stage names don't match:") { + stage.stageName shouldBe expected.stageName + } + clue("Transient features don't match:") { + stage.getTransientFeatures() should contain theSameElementsAs expected.getTransientFeatures() + } + clue("Input features don't match:") { + stage.getInputFeatures() should contain theSameElementsAs expected.getInputFeatures() + } + clue("Input schemas don't match:") { + stage.getInputSchema() shouldBe expected.getInputSchema() + } + clue("Metadata values don't match:") { + stage.getMetadata() shouldBe expected.getMetadata() + } + } + +} diff --git a/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala b/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala new file mode 100644 index 0000000000..ed3934a9dc --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/test/OpTransformerSpec.scala @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.test + +import com.salesforce.op.features.types._ +import com.salesforce.op.features.{FeatureLike, FeatureSparkTypes} +import com.salesforce.op.stages._ +import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.utils.spark.RichRow._ +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.Dataset + +import scala.reflect._ +import scala.reflect.runtime.universe._ + +/** + * Base test class for testing [[OpPipelineStage]] transformer instances. + * Includes common tests for schema and data transformations. + * + * @tparam O output feature type + * @tparam TransformerType type of the transformer being tested + */ +abstract class OpTransformerSpec[O <: FeatureType : WeakTypeTag : ClassTag, +TransformerType <: OpPipelineStage[O] with Transformer with OpTransformer : ClassTag] + extends OpPipelineStageSpec[O, TransformerType] { + + /** + * [[OpTransformer]] instance to be tested + */ + val transformer: TransformerType + + /** + * Input Dataset to transform + */ + val inputData: Dataset[_] + + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult: Seq[O] + + final override lazy val stage = transformer + protected val convert = FeatureTypeSparkConverter[O]() + + it should "transform schema" in { + val transformedSchema = transformer.transformSchema(inputData.schema) + val output = transformer.getOutput() + val validationResults = + FeatureSparkTypes.validateSchema(transformedSchema, transformer.getInputFeatures() :+ output) + if (validationResults.nonEmpty) { + fail("Dataset schema is invalid. Errors: " + validationResults.mkString("'", "','", "'")) + } + } + it should "transform data" in { + val transformed = transformer.transform(inputData) + val output = transformer.getOutput() + val res: Seq[O] = transformed.collect(output)(convert, classTag[O]).toSeq + res shouldEqual expectedResult + } + it should "transform rows" in { + val rows = inputData.toDF().collect() + val res: Seq[O] = rows.view.map(row => transformer.transformRow(row)).map(convert.fromSpark) + res shouldEqual expectedResult + } + it should "transform maps" in { + val rows = inputData.toDF().collect() + val inputNames = transformer.getTransientFeatures().map(_.name) + val maps = rows.view.map(row => inputNames.map(name => name -> row.getAny(name)).toMap) + val res: Seq[O] = maps.map(transformer.transformMap).map(convert.fromSpark) + res shouldEqual expectedResult + } + it should "transform key/value" in { + val rows = inputData.toDF().collect() + val res: Seq[O] = rows.view.map(row => transformer.transformKeyValue(row.getAny)).map(convert.fromSpark) + res shouldEqual expectedResult + } + it should "transform data after being loaded" in { + val loaded = writeAndRead(stage) + val transformed = loaded.asInstanceOf[TransformerType].transform(inputData) + val output = loaded.getOutput().asInstanceOf[FeatureLike[O]] + val res: Seq[O] = transformed.collect(output)(convert, classTag[O]).toSeq + res shouldEqual expectedResult + } + + // TODO: test metadata + +} diff --git a/core/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala b/features/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala similarity index 100% rename from core/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala rename to features/src/main/scala/com/salesforce/op/test/TestOpVectorMetadataBuilder.scala
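To show how this spec is meant to be consumed, here is a hypothetical concrete test for a lowercasing transformer; UnaryLambdaTransformer and TestFeatureBuilder are assumed from elsewhere in the codebase, so treat this as a sketch rather than a verbatim example:

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer
import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class ToLowerTransformerTest extends OpTransformerSpec[Text, UnaryLambdaTransformer[Text, Text]] {
  // Input dataset with a single text feature (TestFeatureBuilder assumed)
  val (inputData, f1) = TestFeatureBuilder(Seq("A", "B", "C").map(_.toText))

  // Transformer under test: lowercases its input feature
  val transformer = new UnaryLambdaTransformer[Text, Text](
    operationName = "lower",
    transformFn = t => t.value.map(_.toLowerCase).toText
  ).setInput(f1)

  // Expected output for the inherited transform data/rows/maps/key-value tests
  val expectedResult: Seq[Text] = Seq("a", "b", "c").map(_.toText)
}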
diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala similarity index 100% rename from utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnHistory.scala diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala similarity index 91% rename from utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala index 7bc8ed214a..6b5f39cee8 100644 --- a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorColumnMetadata.scala @@ -31,10 +31,13 @@ package com.salesforce.op.utils.spark +import com.salesforce.op.features.types.{FeatureType, OPMap} import com.salesforce.op.utils.json.JsonLike import org.apache.spark.sql.types.{Metadata, MetadataBuilder} import com.salesforce.op.utils.spark.RichMetadata.{RichMetadata => RichMeta} +import scala.reflect.runtime.universe._ + /** * Represents the metadata of a column in a vector. @@ -113,19 +116,20 @@ case class OpVectorColumnMetadata s"${indicatorValue.map("_" + _).getOrElse("")}_$index" /** - * Does column have parent features that are maps - * @return boolean indicating whether parent feature type sequence contains Map types + * Does column have parent features of the specified feature type O */ - def hasMapParent(): Boolean = { - // TODO: move this class to `features` or `core` sub project to avoid mentioning types as strings - hasParentOfType("Map") || hasParentOfType("Prediction") - } + def hasParentOfType[O <: FeatureType](implicit tt: TypeTag[O]): Boolean = + parentFeatureType.exists { parentTypeName => + FeatureType.featureTypeTag(parentTypeName).tpe =:= tt.tpe + } /** - * Does column have parent features of specified feature type - * @return boolean indicating whether parent feature type sequence contains type name + * Does column have parent features which are subtypes of feature type O */ - def hasParentOfType(typeName: String): Boolean = parentFeatureType.exists(_.contains(typeName)) + def hasParentOfSubType[O <: FeatureType](implicit tt: TypeTag[O]): Boolean = + parentFeatureType.exists { parentTypeName => + FeatureType.featureTypeTag(parentTypeName).tpe <:< tt.tpe + } /** * Return parent feature names with the key (indicatorGroup) from any map parents included in name @@ -133,7 +137,7 @@ case class OpVectorColumnMetadata * for columns with map parent features */ def parentNamesWithMapKeys(): Seq[String] = - if (hasMapParent()) parentFeatureName.map(p => indicatorGroup.map(p + "_" + _).getOrElse(p)) + if (hasParentOfSubType[OPMap[_]]) parentFeatureName.map(p => indicatorGroup.map(p + "_" + _).getOrElse(p)) else parentFeatureName } diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala similarity index 100% rename from utils/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala 
b/features/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala similarity index 100% rename from utils/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala rename to features/src/main/scala/com/salesforce/op/utils/spark/RichStructType.scala diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/RichVector.scala b/features/src/main/scala/com/salesforce/op/utils/spark/RichVector.scala new file mode 100644 index 0000000000..989e3f3f8a --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/utils/spark/RichVector.scala @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.utils.spark + +import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector} +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} + +/** + * [[org.apache.spark.ml.linalg.Vector]] enrichment functions + */ +object RichVector { + + implicit class RichVector(val v: Vector) extends AnyVal { + + /** + * Add vectors + * + * @param that another vector + * @throws IllegalArgumentException if the vectors have different sizes + * @return vector addition + */ + def +(that: Vector): Vector = { + val res = v.toBreeze + that.toBreeze + toSpark(res) + } + + /** + * Subtract vectors + * + * @param that another vector + * @throws IllegalArgumentException if the vectors have different sizes + * @return vector subtraction + */ + def -(that: Vector): Vector = { + val res = v.toBreeze - that.toBreeze + toSpark(res) + } + + /** + * Convert to [[breeze.linalg.Vector]] + * + * @return [[breeze.linalg.Vector]] + */ + def toBreeze: BreezeVector[Double] = v match { + case s: SparseVector => new BreezeSparseVector[Double](s.indices, s.values, s.size) + case d: DenseVector => new BreezeDenseVector[Double](d.values) + } + + /** + * Convert [[breeze.linalg.Vector]] back to [[org.apache.spark.ml.linalg.Vector]] + * @return [[org.apache.spark.ml.linalg.Vector]] + */ + private def toSpark: BreezeVector[Double] => Vector = { + case s: BreezeSparseVector[Double]@unchecked => new SparseVector(s.length, s.index, s.data) + case d: BreezeDenseVector[Double]@unchecked => new DenseVector(d.data) + } + + } + +} diff --git a/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala b/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala index 1154b31cc7..65d92a309b 100644 --- a/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala +++ b/features/src/main/scala/org/apache/spark/ml/SparkDefaultParamsReadWrite.scala @@ -1,40 +1,32 @@ +// scalastyle:off header.matches /* - * Copyright (c) 2017, Salesforce.com, Inc. - * All rights reserved. + * Modifications: (c) 2017, Salesforce.com, Inc. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * http://www.apache.org/licenses/LICENSE-2.0 * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of Salesforce.com nor the names of its contributors may - * be used to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.apache.spark.ml import com.salesforce.op.stages.OpPipelineStageBase -import org.apache.spark.SparkContext +import org.apache.spark.ml.param.ParamPair import org.apache.spark.ml.util.DefaultParamsReader.{Metadata, loadMetadata} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter} +import org.json4s.JsonDSL._ +import org.json4s._ +import org.json4s.jackson.JsonMethods._ /** * Direct wrappers for ml private [[DefaultParamsWriter]] and [[DefaultParamsReader]] @@ -43,13 +35,39 @@ import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter} case object SparkDefaultParamsReadWrite { /** - * Helper for [[saveMetadata()]] which extracts the JSON to save. + * Helper for [[OpPipelineStageWriter]] which extracts the JSON to save. * This is useful for ensemble models which need to save metadata for many sub-models. * - * @see [[saveMetadata()]] for details on what this includes. + * Note: this method was taken from DefaultParamsWriter.getMetadataToSave, + * but modified to avoid requiring Spark session + * + * @see [[OpPipelineStageWriter]] for details on what this includes. */ - def getMetadataToSave(stage: OpPipelineStageBase, sc: SparkContext): String = - DefaultParamsWriter.getMetadataToSave(stage, sc) + def getMetadataToSave( + stage: OpPipelineStageBase, + extraMetadata: Option[JObject] = None, + paramMap: Option[JValue] = None + ): String = { + val uid = stage.uid + val cls = stage.getClass.getName + val params = stage.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]] + val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) => + p.name -> parse(p.jsonEncode(v)) + }.toList)) + val basicMetadata = ("class" -> cls) ~ + ("timestamp" -> System.currentTimeMillis()) ~ + ("sparkVersion" -> org.apache.spark.SPARK_VERSION) ~ + ("uid" -> uid) ~ + ("paramMap" -> jsonParams) + val metadata = extraMetadata match { + case Some(jObject) => + basicMetadata ~ jObject + case None => + basicMetadata + } + val metadataJson: String = compact(render(metadata)) + metadataJson + } /** * Parse metadata JSON string produced by [[DefaultParamsWriter.getMetadataToSave()]]. 
diff --git a/features/src/test/resources/OpParamsWithAltReader.json b/features/src/test/resources/OpParamsWithAltReader.json index c1290ea795..88c87b627b 100644 --- a/features/src/test/resources/OpParamsWithAltReader.json +++ b/features/src/test/resources/OpParamsWithAltReader.json @@ -26,3 +26,4 @@ } } } + diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala index baa39339d9..22e7437ba2 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeFactoryTest.scala @@ -33,9 +33,10 @@ package com.salesforce.op.features.types import com.salesforce.op.test.TestCommon import org.junit.runner.RunWith -import org.scalatest.{Assertion, Matchers, PropSpec} +import org.scalactic.source import org.scalatest.junit.JUnitRunner import org.scalatest.prop.{PropertyChecks, TableFor1} +import org.scalatest.{Assertion, Matchers, PropSpec} import scala.concurrent.duration._ import scala.util.{Failure, Success, Try} @@ -127,12 +128,32 @@ class FeatureTypeFactoryTest trait FeatureTypeAsserts { self: Matchers => - def assertCreate(ft: => FeatureType): Assertion = Try(ft) match { - case Failure(e) => - e shouldBe a[NonNullableEmptyException] - case Success(v) => - v should not be null - v shouldBe a[FeatureType] + /** + * Asserts creation of the feature type value + * + * @param makeIt make block for feature + * @return [[Assertion]] + */ + def assertCreate(makeIt: => FeatureType)(implicit pos: source.Position): Assertion = + assertCreate(makeIt, (v: FeatureType) => assert(true)) + + /** + * Asserts creation of the feature type value + * + * @param makeIt make block for feature + * @param assertion optional assertion + * @return [[Assertion]] + */ + def assertCreate(makeIt: => FeatureType, assertion: FeatureType => Assertion) + (implicit pos: source.Position): Assertion = { + Try(makeIt) match { + case Failure(e) => + e shouldBe a[NonNullableEmptyException] + case Success(v) => + v should not be null + v shouldBe a[FeatureType] + assertion(v) + } } } diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala index 178888a6d2..93fb4737bd 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeSparkConverterTest.scala @@ -33,6 +33,7 @@ package com.salesforce.op.features.types import com.salesforce.op.test.TestCommon import org.junit.runner.RunWith +import org.scalacheck.Gen import org.scalatest.PropSpec import org.scalatest.junit.JUnitRunner import org.scalatest.prop.{PropertyChecks, TableFor1} @@ -44,60 +45,13 @@ import scala.concurrent.duration._ class FeatureTypeSparkConverterTest extends PropSpec with PropertyChecks with TestCommon with ConcurrentCheck with FeatureTypeAsserts { - val featureTypeConverters: TableFor1[FeatureTypeSparkConverter[_ <: FeatureType]] = Table("ft", - // Vector - FeatureTypeSparkConverter[OPVector](), - // Lists - FeatureTypeSparkConverter[TextList](), - FeatureTypeSparkConverter[DateList](), - FeatureTypeSparkConverter[DateTimeList](), - // Maps - FeatureTypeSparkConverter[Base64Map](), - FeatureTypeSparkConverter[BinaryMap](), - FeatureTypeSparkConverter[ComboBoxMap](), - 
-    FeatureTypeSparkConverter[CurrencyMap](),
-    FeatureTypeSparkConverter[DateMap](),
-    FeatureTypeSparkConverter[DateTimeMap](),
-    FeatureTypeSparkConverter[EmailMap](),
-    FeatureTypeSparkConverter[IDMap](),
-    FeatureTypeSparkConverter[IntegralMap](),
-    FeatureTypeSparkConverter[MultiPickListMap](),
-    FeatureTypeSparkConverter[PercentMap](),
-    FeatureTypeSparkConverter[PhoneMap](),
-    FeatureTypeSparkConverter[PickListMap](),
-    FeatureTypeSparkConverter[RealMap](),
-    FeatureTypeSparkConverter[TextAreaMap](),
-    FeatureTypeSparkConverter[TextMap](),
-    FeatureTypeSparkConverter[URLMap](),
-    FeatureTypeSparkConverter[CountryMap](),
-    FeatureTypeSparkConverter[StateMap](),
-    FeatureTypeSparkConverter[CityMap](),
-    FeatureTypeSparkConverter[PostalCodeMap](),
-    FeatureTypeSparkConverter[StreetMap](),
-    FeatureTypeSparkConverter[GeolocationMap](),
-    FeatureTypeSparkConverter[Prediction](),
-    // Numerics
-    FeatureTypeSparkConverter[Binary](),
-    FeatureTypeSparkConverter[Currency](),
-    FeatureTypeSparkConverter[Date](),
-    FeatureTypeSparkConverter[DateTime](),
-    FeatureTypeSparkConverter[Integral](),
-    FeatureTypeSparkConverter[Percent](),
-    FeatureTypeSparkConverter[Real](),
-    FeatureTypeSparkConverter[RealNN](),
-    // Sets
-    FeatureTypeSparkConverter[MultiPickList](),
-    // Text
-    FeatureTypeSparkConverter[Base64](),
-    FeatureTypeSparkConverter[ComboBox](),
-    FeatureTypeSparkConverter[Email](),
-    FeatureTypeSparkConverter[ID](),
-    FeatureTypeSparkConverter[Phone](),
-    FeatureTypeSparkConverter[PickList](),
-    FeatureTypeSparkConverter[Text](),
-    FeatureTypeSparkConverter[TextArea](),
-    FeatureTypeSparkConverter[URL]()
+  val featureTypeConverters: TableFor1[FeatureTypeSparkConverter[_ <: FeatureType]] = Table("ftc",
+    FeatureTypeSparkConverter.featureTypeSparkConverters.values.toSeq: _*
   )
+  val featureTypeNames: TableFor1[String] = Table("ftnames",
+    FeatureTypeSparkConverter.featureTypeSparkConverters.keys.toSeq: _*
+  )
+  val bogusNames = Gen.alphaNumStr
   property("is a feature type converter") {
     forAll(featureTypeConverters) { ft => ft shouldBe a[FeatureTypeSparkConverter[_]] }
@@ -105,14 +59,40 @@ class FeatureTypeSparkConverterTest
   property("is serializable") {
     forAll(featureTypeConverters) { ft => ft shouldBe a[Serializable] }
   }
+  property("make a converter by feature type name") {
+    forAll(featureTypeNames) { featureTypeName =>
+      val ft: FeatureTypeSparkConverter[_ <: FeatureType] =
+        FeatureTypeSparkConverter.fromFeatureTypeName(featureTypeName)
+      assertCreate(ft.fromSpark(null))
+    }
+  }
+  property("error on making a converter for a non-existent feature type name") {
+    forAll(bogusNames) { bogusName =>
+      intercept[IllegalArgumentException](
+        FeatureTypeSparkConverter.fromFeatureTypeName(bogusName)
+      ).getMessage shouldBe s"Unknown feature type '$bogusName'"
+    }
+  }
   property("create a feature type instance of null") {
     forAll(featureTypeConverters)(ft => assertCreate(ft.fromSpark(null)))
   }
-  property("create a feature type instance in a timely fashion") {
+  property("create a feature type instance of null and back") {
+    forAll(featureTypeConverters) { ft =>
+      assertCreate(ft.fromSpark(null), (v: FeatureType) => {
+        ft.asInstanceOf[FeatureTypeSparkConverter[FeatureType]].toSpark(v) shouldBe (null: Any)
+        FeatureTypeSparkConverter.toSpark(v) shouldBe (null: Any)
+      })
+    }
+  }
+  property("create a feature type instance and back in a timely fashion") {
     forAllConcurrentCheck[FeatureTypeSparkConverter[_ <: FeatureType]](
       numThreads = 10, numInstancesPerThread = 50000, atMost = 10.seconds, table =
featureTypeConverters, - functionCheck = ft => assertCreate(ft.fromSpark(null)) + functionCheck = ft => { + assertCreate(ft.fromSpark(null), (v: FeatureType) => { + ft.asInstanceOf[FeatureTypeSparkConverter[FeatureType]].toSpark(v) shouldBe (null: Any) + }) + } ) } } diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala index 6f8bb73c89..ca6c960ec6 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala @@ -32,6 +32,7 @@ package com.salesforce.op.features.types import com.salesforce.op.test.TestCommon +import com.salesforce.op.utils.reflection.ReflectionUtils import org.apache.lucene.geo.GeoUtils import org.apache.spark.ml.linalg.DenseVector import org.junit.runner.RunWith @@ -237,8 +238,8 @@ class FeatureTypeValueTest extends PropSpec with PropertyChecks with TestCommon * @tparam FT feature type (OP type) */ private def checkTypeTags[FT <: FeatureType](implicit vtt: TypeTag[FT#Value]): Assertion = { - withClue(s"Feature value type ${vtt.tpe} (dealised: ${vtt.tpe.dealias})") { - val tt = Try(FeatureType.featureValueTypeTag(vtt.tpe.dealias.toString)) + withClue(s"Feature value type ${vtt.tpe} (dealised: ${ReflectionUtils.dealisedTypeName(vtt.tpe)}): ") { + val tt = Try(FeatureType.featureValueTypeTag(ReflectionUtils.dealisedTypeName(vtt.tpe))) if (tt.isFailure) fail(tt.failed.get) tt.get.tpe =:= vtt.tpe shouldBe true FeatureType.isFeatureValueType(vtt) shouldBe true diff --git a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala similarity index 54% rename from core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala index e2aa10227b..4a939ba2d0 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryEstimatorTest.scala @@ -32,22 +32,18 @@ package com.salesforce.op.stages.base.binary import com.salesforce.op.UID -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec} - @RunWith(classOf[JUnitRunner]) -class BinaryEstimatorTest extends FlatSpec with TestSparkContext with Assertions { +class BinaryEstimatorTest + extends OpEstimatorSpec[OPVector, BinaryModel[Text, Text, OPVector], BinaryEstimator[Text, Text, OPVector]] { - val (ds, city, country) = TestFeatureBuilder("city", "country", + val (inputData, city, country) = TestFeatureBuilder("city", "country", Seq( (Text("San Francisco"), Text("USA")), (Text("Paris"), Text("France")), @@ -59,59 +55,21 @@ class BinaryEstimatorTest extends FlatSpec with TestSparkContext with Assertions ) ) - val testEstimator: BinaryEstimator[Text, Text, OPVector] = new TestPivotEstimator() - - 
Spec[BinaryEstimator[_, _, _]] should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(city, country).getOutput() - outputFeatures shouldBe new Feature[OPVector]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(city, country) - ) - } - - it should "return a BinaryModel with the estimator as the parent and the correct function" in { - val testModel = testEstimator.setInput(city, country).fit(ds) - - testModel.parent shouldBe testEstimator - testModel.transformFn(Text("San Francisco"), Text("USA")) shouldBe Vectors.dense(1, 0).toOPVector - } - - - it should "create a BinaryModel that uses the specified transform function when fit" in { - val testModel = testEstimator.setInput(city, country).fit(ds) - val testDataTransformed = testModel.setInput(city, country).transform(ds) - val outputFeatures = testEstimator.getOutput() - val transformedValues = testDataTransformed.collect(city, country, outputFeatures).toList + val estimator = new TestPivotEstimator().setInput(city, country) - testDataTransformed.schema.fields.map(_.name).toSet shouldEqual Set(city.name, country.name, outputFeatures.name) + val expectedResult = Seq( + Vectors.dense(1.0, 0.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(1.0, 0.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(0.0, 1.0), + Vectors.dense(0.0, 1.0) + ).map(_.toOPVector) - transformedValues.toSet shouldEqual Set( - (Text("San Francisco"), Text("USA"), Vectors.dense(1.0, 0.0).toOPVector), - (Text("Paris"), Text("France"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("Austin"), Text("USA"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("San Francisco"), Text("USA"), Vectors.dense(1.0, 0.0).toOPVector), - (Text("Paris"), Text("USA"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("Puerto Arenas"), Text("Chile"), Vectors.dense(0.0, 1.0).toOPVector), - (Text("Iquitos"), Text(None), Vectors.dense(0.0, 1.0).toOPVector) - ) - - } - - it should "copy itself and the model successfully" in { - val est = new TestPivotEstimator() - val mod = new TestPivotModel("", est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } } + class TestPivotEstimator(uid: String = UID[TestPivotEstimator]) extends BinaryEstimator[Text, Text, OPVector](operationName = "pivot", uid = uid) { diff --git a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala similarity index 70% rename from core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala index 14252cfdbb..601048b393 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala @@ -29,24 +29,24 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -package com.salesforce.op.stages.base.quaternary +package com.salesforce.op.stages.base.binary -import com.salesforce.op.features.types.Text -import com.salesforce.op.test._ -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{Assertions, FlatSpec, Matchers} + @RunWith(classOf[JUnitRunner]) -class QuaternaryTransformerTest extends FlatSpec with TestCommon { - - Spec[QuaternaryLambdaTransformer[_, _, _, _ , _]] should "copy successfully" in { - val tr = new QuaternaryLambdaTransformer[Text, Text, Text, Text, Text]( - operationName = "foo", - transformFn = (x, y, z, u) => x - ) - tr.copy(new ParamMap()).uid shouldBe tr.uid - } +class BinaryTransformerTest extends OpTransformerSpec[Real, BinaryTransformer[Real, RealNN, Real]] { + + val sample = Seq(Real(1.0) -> RealNN(0.0), Real(2.0) -> RealNN(2.0), Real.empty -> RealNN(1.0)) + val (inputData, f1, f2) = TestFeatureBuilder(sample) + + val transformer = new BinaryLambdaTransformer[Real, RealNN, Real](operationName = "bmi", + transformFn = (i1, i2) => new Real(for { v1 <- i1.value; v2 <- i2.value } yield v1 / (v2 * v2)) + ).setInput(f1, f2) + + val expectedResult = Seq(Real(Double.PositiveInfinity), Real(0.5), Real.empty) } diff --git a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala similarity index 58% rename from core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala index 8912148736..08e4a72064 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryEstimatorTest.scala @@ -32,76 +32,34 @@ package com.salesforce.op.stages.base.quaternary import com.salesforce.op.UID -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.PassengerSparkFixtureTest -import com.salesforce.op.utils.spark.RichDataset._ -import org.apache.spark.ml.param.ParamMap +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types._ import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class QuaternaryEstimatorTest extends FlatSpec with PassengerSparkFixtureTest { - - var testEstimator: QuaternaryEstimator[Real, TextMap, BinaryMap, MultiPickList, Real] = new FantasticFourEstimator() - - Spec[QuaternaryEstimator[_, _, _, _, _]] should - "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(age, stringMap, booleanMap, gender).getOutput() - outputFeatures shouldBe new Feature[Real]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(age, stringMap, booleanMap, gender) - ) - - } - - it should "create a TernaryModel that uses the specified transform function when fit" in { - val 
testModel = testEstimator.setInput(age, stringMap, booleanMap, gender).fit(passengersDataSet) - val testDataTransformed = testModel.setInput(age, stringMap, booleanMap, gender) - .transform(passengersDataSet.select(age.name, stringMap.name, booleanMap.name, gender.name)) - - testDataTransformed.schema shouldEqual StructType( - Seq(StructField(age.name, DoubleType, true), - StructField(stringMap.name, MapType(StringType, StringType, true), true), - StructField(booleanMap.name, MapType(StringType, BooleanType, true), true), - StructField(gender.name, ArrayType(StringType, true), true), - StructField(testEstimator.getOutputFeatureName, DoubleType, true) - ) +class QuaternaryEstimatorTest + extends OpEstimatorSpec[Real, + QuaternaryModel[Real, TextMap, BinaryMap, MultiPickList, Real], + QuaternaryEstimator[Real, TextMap, BinaryMap, MultiPickList, Real]] { + + val (inputData, reals, textMap, booleanMap, binary) = TestFeatureBuilder( + Seq( + (Real.empty, TextMap(Map("a" -> "keen")), BinaryMap(Map("a" -> true)), MultiPickList(Set("a"))), + (Real(15.0), TextMap(Map("b" -> "bok")), BinaryMap(Map("b" -> true)), MultiPickList(Set("b"))), + (Real(23.0), TextMap(Map("c" -> "bar")), BinaryMap(Map("c" -> true)), MultiPickList(Set("c"))), + (Real(40.0), TextMap(Map.empty), BinaryMap(Map("d" -> true)), MultiPickList(Set("d"))), + (Real(65.0), TextMap(Map("e" -> "B")), BinaryMap(Map("e" -> true)), MultiPickList(Set("e"))) ) + ) - val expected = Array( - Real(13.833333333333336), - Real(None), - Real(-3.1666666666666643), - Real(-34.166666666666664), - Real(None), - Real(-4.166666666666664) - ) - - testDataTransformed.collect(testModel.getOutput()) shouldEqual expected - } - - it should "copy itself and the model successfully" in { - val est = new FantasticFourEstimator() - val mod = new FantasticFourModel(0.0, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } + val estimator = new FantasticFourEstimator().setInput(reals, textMap, booleanMap, binary) + val expectedResult = Seq(Real.empty, Real(-31.6), Real(-23.6), Real.empty, Real(18.4)) } - class FantasticFourEstimator(uid: String = UID[FantasticFourEstimator]) extends QuaternaryEstimator[Real, TextMap, BinaryMap, MultiPickList, Real](operationName = "fantasticFour", uid = uid) with FantasticFour { diff --git a/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala new file mode 100644 index 0000000000..1f7ba68c62 --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/quaternary/QuaternaryTransformerTest.scala @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.base.quaternary + +import com.salesforce.op.features.types._ +import com.salesforce.op.test._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class QuaternaryTransformerTest + extends OpTransformerSpec[Real, QuaternaryTransformer[Real, Integral, Text, Binary, Real]] { + + val sample = Seq( + (Real(1.0), Integral(0), Text("abc"), Binary(false)), + (Real(2.0), Integral(2), Text("a"), Binary(true)), + (Real.empty, Integral(3), Text("abcdefg"), Binary(true)) + ) + + val (inputData, f1, f2, f3, f4) = TestFeatureBuilder(sample) + + val transformer = new QuaternaryLambdaTransformer[Real, Integral, Text, Binary, Real](operationName = "quatro", + transformFn = (r, i, t, b) => + (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0) + + t.value.map(_.length.toDouble).getOrElse(0.0)).toReal + ).setInput(f1, f2, f3, f4) + + val expectedResult = Seq(4.toReal, 6.toReal, 11.toReal) + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala similarity index 55% rename from core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala index 68b5218aec..f4800f72b1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceEstimatorTest.scala @@ -32,23 +32,20 @@ package com.salesforce.op.stages.base.sequence import com.salesforce.op.UID -import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.RichDataset._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.SequenceAggregators import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class SequenceEstimatorTest extends FlatSpec with TestSparkContext { +class SequenceEstimatorTest + extends OpEstimatorSpec[OPVector, SequenceModel[DateList, OPVector], SequenceEstimator[DateList, OPVector]] { - val data = Seq[(DateList, DateList, DateList)]( + val sample = Seq[(DateList, DateList, DateList)]( (new DateList(1476726419000L, 1476726019000L), new DateList(1476726919000L), new 
DateList(1476726519000L)), @@ -59,60 +56,18 @@ class SequenceEstimatorTest extends FlatSpec with TestSparkContext { new DateList(1476728919000L), new DateList(1476726619000L, 1476726949000L)) ) - val (ds, clicks, opens, purchases) = TestFeatureBuilder("clicks", "opens", "purchases", data) + val (inputData, clicks, opens, purchases) = TestFeatureBuilder("clicks", "opens", "purchases", sample) - val testEstimator: SequenceEstimator[DateList, OPVector] = new FractionOfResponsesEstimator() + val estimator = new FractionOfResponsesEstimator().setInput(clicks, opens, purchases) - Spec[SequenceEstimator[_, _]] should "throw an error if you try to get the output without setting the inputs" in { - intercept[java.util.NoSuchElementException](testEstimator.getOutput()) - } - - it should "return a single output feature of the correct type" in { - val outputFeatures = testEstimator.setInput(clicks, opens, purchases).getOutput() - outputFeatures shouldBe new Feature[OPVector]( - name = testEstimator.getOutputFeatureName, - originStage = testEstimator, - isResponse = false, - parents = Array(clicks, opens, purchases) - ) - } - - it should "return a SequenceModel with the estimator as the parent and the correct function" in { - val testModel = testEstimator.setInput(clicks, opens, purchases).fit(ds) - testModel.parent shouldBe testEstimator - testModel.transformFn( - Seq(new DateList(1476726419000L), new DateList(1476726419000L), new DateList(1476726419000L)) - ) shouldEqual Vectors.dense(0.2, 0.25, 0.25).toOPVector - } - - it should "create a SequenceModel that uses the specified transform function when fit" in { - val testModel = testEstimator.setInput(clicks, opens, purchases).fit(ds) - val testDataTransformed = testModel.setInput(clicks, opens, purchases).transform(ds) - val transformedValues = testDataTransformed.collect(clicks, opens, purchases, testModel.getOutput()) - - // This is string because of vector type being private to spark ml - testDataTransformed.schema.fieldNames.toSet shouldEqual - Set(clicks.name, opens.name, purchases.name, testEstimator.getOutputFeatureName) - - val fractions = Array( - Vectors.dense(0.4, 0.25, 0.25).toOPVector, - Vectors.dense(0.4, 0.5, 0.25).toOPVector, - Vectors.dense(0.2, 0.25, 0.5).toOPVector - ) - val expected = data.zip(fractions) .map { case ((d1, d2, d3), f) => (d1, d2, d3, f)} - - transformedValues shouldBe expected - } - - it should "copy itself and the model successfully" in { - val est = new FractionOfResponsesEstimator() - val mod = new FractionOfResponsesModel(Seq.empty, est.operationName, est.uid) - - est.copy(new ParamMap()).uid shouldBe est.uid - mod.copy(new ParamMap()).uid shouldBe mod.uid - } + val expectedResult = Seq( + Vectors.dense(0.4, 0.25, 0.25).toOPVector, + Vectors.dense(0.4, 0.5, 0.25).toOPVector, + Vectors.dense(0.2, 0.25, 0.5).toOPVector + ) } + class FractionOfResponsesEstimator(uid: String = UID[FractionOfResponsesEstimator]) extends SequenceEstimator[DateList, OPVector](operationName = "fractionOfResponses", uid = uid) { def fitFn(dataset: Dataset[Seq[Seq[Long]]]): SequenceModel[DateList, OPVector] = { diff --git a/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala new file mode 100644 index 0000000000..16d531016e --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/sequence/SequenceTransformerTest.scala @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.base.sequence + +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class SequenceTransformerTest extends OpTransformerSpec[MultiPickList, SequenceTransformer[Real, MultiPickList]] { + + val sample = Seq( + 1.toReal -> 1.toReal, + (-1).toReal -> 1.toReal, + 15.toReal -> Real.empty, + 1.111.toReal -> 2.222.toReal + ) + val (inputData, f1, f2) = TestFeatureBuilder(sample) + + val transformer = new SequenceLambdaTransformer[Real, MultiPickList](operationName = "realToMultiPicklist", + transformFn = value => MultiPickList(value.flatMap(_.v.map(_.toString)).toSet) + ).setInput(f1, f2) + + val expectedResult = Seq( + Set("1.0").toMultiPickList, + Set("-1.0", "1.0").toMultiPickList, + Set("15.0").toMultiPickList, + Set("1.111", "2.222").toMultiPickList + ) +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala new file mode 100644 index 0000000000..3a11f4db3c --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryEstimatorTest.scala @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.base.ternary + +import com.salesforce.op.UID +import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import org.apache.spark.sql.Dataset +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class TernaryEstimatorTest + extends OpEstimatorSpec[Real, + TernaryModel[MultiPickList, Binary, RealMap, Real], + TernaryEstimator[MultiPickList, Binary, RealMap, Real]] { + + val (inputData, gender, numericMap, survived) = TestFeatureBuilder("gender", "numericMap", "survived", + Seq( + (MultiPickList.empty, RealMap(Map("teen" -> 1.0)), Binary(true)), + (MultiPickList(Set("teen")), RealMap(Map("teen" -> 2.0)), Binary(false)), + (MultiPickList(Set("teen")), RealMap(Map("teen" -> 3.0)), Binary(false)), + (MultiPickList(Set("adult")), RealMap(Map("adult" -> 1.0)), Binary(false)), + (MultiPickList(Set("senior")), RealMap(Map("senior" -> 1.0, "adult" -> 2.0)), Binary(false)) + ) + ) + + val estimator = new TripleInteractionsEstimator().setInput(gender, survived, numericMap) + + val expectedResult = Seq(Real.empty, Real(0.25), Real(1.25), Real(-0.75), Real(-0.75)) +} + +class TripleInteractionsEstimator(uid: String = UID[TripleInteractionsEstimator]) + extends TernaryEstimator[MultiPickList, Binary, RealMap, Real](operationName = "tripleInteractions", uid = uid) + with TripleInteractions { + + // scalastyle:off line.size.limit + def fitFn(dataset: Dataset[(MultiPickList#Value, Binary#Value, RealMap#Value)]): TernaryModel[MultiPickList, Binary, RealMap, Real] = { + import dataset.sparkSession.implicits._ + val mean = { + dataset.map { case (gndr, srvvd, nmrcMp) => + if (survivedAndMatches(gndr, srvvd, nmrcMp)) nmrcMp(gndr.head) else 0.0 + }.filter(_ != 0.0).groupBy().mean().first().getDouble(0) + } + new TripleInteractionsModel(mean = mean, operationName = operationName, uid = uid) + } + // scalastyle:on + +} + +final class TripleInteractionsModel private[op](val mean: Double, operationName: String, uid: String) + extends TernaryModel[MultiPickList, Binary, RealMap, Real](operationName = operationName, uid = uid) + with TripleInteractions { + + def transformFn: (MultiPickList, Binary, RealMap) => Real = (g: MultiPickList, s: Binary, nm: RealMap) => new Real( + if (!survivedAndMatches(g.value, s.value, nm.value)) None + else Some(nm.value(g.value.head) - mean) + ) + +} + +sealed trait TripleInteractions { + def survivedAndMatches(g: MultiPickList#Value, s: Binary#Value, nm: 
RealMap#Value): Boolean = + !s.getOrElse(false) && g.nonEmpty && nm.contains(g.head) +} diff --git a/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala new file mode 100644 index 0000000000..6b8f1cf527 --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/stages/base/ternary/TernaryTransformerTest.scala @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.base.ternary + +import com.salesforce.op.features.types._ +import com.salesforce.op.test._ +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class TernaryTransformerTest extends OpTransformerSpec[Real, TernaryTransformer[Real, Integral, Binary, Real]] { + + val sample = Seq( + (Real(1.0), Integral(0), Binary(false)), + (Real(2.0), Integral(2), Binary(true)), + (Real.empty, Integral(3), Binary(true)) + ) + + val (inputData, f1, f2, f3) = TestFeatureBuilder(sample) + + val transformer = new TernaryLambdaTransformer[Real, Integral, Binary, Real](operationName = "trio", + transformFn = (r, i, b) => (r.v.getOrElse(0.0) + i.toDouble.getOrElse(0.0) + b.toDouble.getOrElse(0.0)).toReal + ).setInput(f1, f2, f3) + + val expectedResult = Seq(1.toReal, 5.toReal, 4.toReal) + +} diff --git a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala similarity index 54% rename from core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala rename to features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala index 27d2f04005..570c3c0270 100644 --- a/core/src/test/scala/com/salesforce/op/stages/base/binary/BinaryTransformerTest.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryEstimatorTest.scala @@ -29,51 +29,53 @@ * POSSIBILITY OF SUCH DAMAGE. */ -package com.salesforce.op.stages.base.binary +package com.salesforce.op.stages.base.unary -import com.salesforce.op.test.PassengerSparkFixtureTest +import com.salesforce.op.UID import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.spark.RichRow._ import org.apache.spark.ml.param.ParamMap +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType} import org.junit.runner.RunWith +import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class BinaryTransformerTest extends FlatSpec with PassengerSparkFixtureTest { +class UnaryEstimatorTest extends OpEstimatorSpec[Real, UnaryModel[Real, Real], UnaryEstimator[Real, Real]] { - val bmi = new BinaryLambdaTransformer[Real, RealNN, Real](operationName = "bmi", - transformFn = (i1, i2) => new Real(for { v1 <- i1.value; v2 <- i2.value } yield v1 / (v2 * v2)) - ).setInput(weight, height) + /** + * Input Dataset to fit & transform + */ + val (inputData, f1) = TestFeatureBuilder(Seq(1.0, 5.0, 3.0, 2.0, 6.0).toReal) - Spec[BinaryLambdaTransformer[_, _, _]] should "return single properly formed Feature" in { - val feats = bmi.getOutput() + /** + * Estimator instance to be tested + */ + val estimator = new MinMaxNormEstimator().setInput(f1) - feats shouldBe new Feature[Real]( - name = bmi.getOutputFeatureName, - originStage = bmi, - isResponse = false, - parents = Array(weight, height) - ) - } + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult = Seq(0.0, 0.8, 0.4, 0.2, 1.0).map(_.toReal) - it should "add column to DataFrame when transformed" in { - val transformedData = bmi.transform(passengersDataSet) - val columns = transformedData.columns - 
assert(columns.contains(bmi.getOutputFeatureName)) - val output = bmi.getOutput() - val answer = passengersArray.map(r => - bmi.transformFn(r.getFeatureType[Real](weight), r.getFeatureType[RealNN](height)) - ) - transformedData.collect(output) shouldBe answer - } +} - it should "copy successfully" in { - val copy = bmi.copy(new ParamMap()) - copy.isInstanceOf[BinaryTransformer[_, _, _]] shouldBe true - copy.uid shouldBe bmi.uid +class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator]) + extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) { + + def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = { + val grouped = dataset.groupBy() + val maxVal = grouped.max().first().getDouble(0) + val minVal = grouped.min().first().getDouble(0) + new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid) } } + +final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String) + extends UnaryModel[Real, Real](operationName = operationName, uid = uid) { + def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal +} diff --git a/core/src/main/scala/org/apache/spark/ml/classification/OpLogisticRegressionModel.scala b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala similarity index 66% rename from core/src/main/scala/org/apache/spark/ml/classification/OpLogisticRegressionModel.scala rename to features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala index e2707fb3b0..a4108318ea 100644 --- a/core/src/main/scala/org/apache/spark/ml/classification/OpLogisticRegressionModel.scala +++ b/features/src/test/scala/com/salesforce/op/stages/base/unary/UnaryTransformerTest.scala @@ -29,26 +29,33 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -package org.apache.spark.ml.classification +package com.salesforce.op.stages.base.unary -import com.salesforce.op.UID import com.salesforce.op.features.types._ -import org.apache.spark.ml.linalg.{Matrix, Vector} - -import scala.reflect.runtime.universe.TypeTag - -class OpLogisticRegressionModel -( - coefficientMatrix: Matrix, - interceptVector: Vector, - numClasses: Int, - val isMultinomial: Boolean, - val operationName: String = "opLR", - uid: String = UID[OpLogisticRegressionModel] -)( - implicit val tti1: TypeTag[RealNN], - val tti2: TypeTag[OPVector], - val tto: TypeTag[Prediction], - val ttov: TypeTag[Prediction#Value] -) extends LogisticRegressionModel(uid = uid, coefficientMatrix = coefficientMatrix, - interceptVector = interceptVector, numClasses = numClasses, isMultinomial = isMultinomial) with OpClassifierModelBase +import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + + +@RunWith(classOf[JUnitRunner]) +class UnaryTransformerTest extends OpTransformerSpec[Real, UnaryLambdaTransformer[Real, Real]] { + + /** + * Input Dataset to transform + */ + val (inputData, f1) = TestFeatureBuilder(Seq(Some(1), Some(2), Some(3), None).map(_.toReal)) + + /** + * [[OpTransformer]] instance to be tested + */ + val transformer = new UnaryLambdaTransformer[Real, Real]( + operationName = "unary", + transformFn = r => r.v.map(_ * 2.0).toReal + ).setInput(f1) + + /** + * Expected result of the transformer applied on the Input Dataset + */ + val expectedResult = Seq(Real(2), Real(4), Real(6), Real.empty) + +} diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala similarity index 93% rename from utils/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala rename to features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala index 3e44b3b09d..78577b79ea 100644 --- a/utils/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala @@ -32,6 +32,7 @@ package com.salesforce.op.utils.spark import com.salesforce.op.FeatureHistory +import com.salesforce.op.features.types.{DateTime, Email, FeatureType, OPMap, PickList, Prediction, Real, RealMap, TextAreaMap} import com.salesforce.op.test.TestCommon import org.apache.spark.sql.types.Metadata import org.junit.runner.RunWith @@ -51,13 +52,19 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks // AttributeGroup and Attribute require non-empty names val genName: Gen[String] = Gen.nonEmptyListOf(alphaNumChar).map(_.mkString) + val genType: Gen[String] = Gen.oneOf( + FeatureType.typeName[DateTime], FeatureType.typeName[Email], FeatureType.typeName[PickList], + FeatureType.typeName[Prediction], FeatureType.typeName[Real], FeatureType.typeName[RealMap], + FeatureType.typeName[TextAreaMap] + ) val genValue: Gen[String] = Gen.oneOf(genName, Gen.oneOf(Seq(OpVectorColumnMetadata.NullString))) val vecColTupleGen: Gen[OpVectorColumnTuple] = for { - seq <- Gen.containerOf[Seq, String](genName) + nameSeq <- Gen.containerOf[Seq, String](genName) + typeSeq <- Gen.listOfN(nameSeq.length, genType) group <- Gen.option(genName) value <- Gen.option(genValue) } yield { - (seq, seq, group, value, 0) + (nameSeq, typeSeq, group, value, 0) } val featHistTupleGen: Gen[FeatureHistoryTuple] = Gen.zip( @@ -181,8 +188,8 
@@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks hist.indicatorValue shouldBe meta.indicatorValue hist.indicatorGroup shouldBe meta.indicatorGroup hist.indicatorValue.contains(OpVectorColumnMetadata.NullString) shouldBe meta.isNullIndicator - hist.parentFeatureType.foreach(p => p.contains(p) shouldBe meta.hasParentOfType(p)) - hist.parentFeatureType.exists(p => p.contains("Map") || p.contains("Prediction")) shouldBe meta.hasMapParent() + hist.parentFeatureType.exists(p => p.contains("Map") || p.contains("Prediction")) shouldBe + meta.hasParentOfSubType[OPMap[_]] } if (colHist.nonEmpty && colHist.head.parentFeatureName.nonEmpty) { colHist.head.parentFeatureName.flatMap(p => history(p).stages).distinct.sorted should diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala similarity index 100% rename from utils/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala rename to features/src/test/scala/com/salesforce/op/utils/spark/RichStructTypeTest.scala diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala new file mode 100644 index 0000000000..cb4a4bd616 --- /dev/null +++ b/features/src/test/scala/com/salesforce/op/utils/spark/RichVectorTest.scala @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of Salesforce.com nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+package com.salesforce.op.utils.spark
+
+import com.holdenkarau.spark.testing.RDDGenerator
+import com.salesforce.op.features.types.ConcurrentCheck
+import com.salesforce.op.test.TestSparkContext
+import com.twitter.algebird.Monoid
+import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors}
+import org.apache.spark.rdd.RDD
+import org.junit.runner.RunWith
+import org.scalacheck.Gen
+import org.scalactic.TolerantNumerics
+import org.scalatest.PropSpec
+import org.scalatest.junit.JUnitRunner
+import org.scalatest.prop.PropertyChecks
+
+import scala.concurrent.duration._
+
+
+@RunWith(classOf[JUnitRunner])
+class RichVectorTest extends PropSpec with PropertyChecks with TestSparkContext with ConcurrentCheck {
+
+  import VectorGenerators._
+  import com.salesforce.op.utils.spark.RichVector._
+
+  lazy val sparseVectorsRDDGen = RDDGenerator.genRDD[Vector](spark.sparkContext)(sparseVectorGen)
+
+  property("Vectors should error on size mismatch") {
+    forAll(sparseVectorGen) { sparse: SparseVector =>
+      val wrongSize = Vectors.sparse(sparse.size + 1, Array(0), Array(1.0))
+      val dense = sparse.toDense
+      for {
+        res <- Seq(
+          () => sparse + wrongSize,
+          () => sparse - wrongSize,
+          () => dense + wrongSize,
+          () => dense - wrongSize,
+          () => dense + wrongSize.toDense,
+          () => dense - wrongSize.toDense
+        )
+      } {
+        intercept[IllegalArgumentException](res()).getMessage should {
+          startWith("requirement failed: Vectors must") and include("same length")
+        }
+      }
+    }
+  }
+
+  property("Vectors should '+' add correctly") {
+    forAll(sparseVectorGen) { sparse: SparseVector =>
+      val expected = sparse.toArray.map(_ * 2)
+      val dense = sparse.toDense
+      for {res <- Seq(sparse + sparse, dense + sparse, sparse + dense, dense + dense)} {
+        res.size shouldBe sparse.size
+        res.toArray should contain theSameElementsAs expected
+      }
+    }
+  }
+
+  property("Vectors should '-' subtract correctly") {
+    forAll(sparseVectorGen) { sparse: SparseVector =>
+      val dense = sparse.toDense
+      for {res <- Seq(sparse - sparse, dense - sparse, sparse - dense, dense - dense)} {
+        res.size shouldBe sparse.size
+        res.toArray.foreach(_ shouldBe 0.0)
+      }
+    }
+  }
+
+  property("Vectors convert to breeze vectors correctly") {
+    forAll(sparseVectorGen) { sparse: SparseVector =>
+      val dense = sparse.toDense
+      sparse.toBreeze.toArray should contain theSameElementsAs dense.toBreeze.toArray
+    }
+  }
+
+  property("Sparse vectors should '+' add efficiently") {
+    val sparseSize = 100000000
+    val sparse = new SparseVector(sparseSize, Array(0, 1, sparseSize - 1), Array(-1.0, 1.0, 3.0))
+    val expected = new SparseVector(sparseSize, Array(0, 1, sparseSize - 1), Array(-2.0, 2.0, 6.0))
+
+    forAllConcurrentCheck[SparseVector](
+      numThreads = 10, numInstancesPerThread = 100000, atMost = 10.seconds,
+      table = Table[SparseVector]("sparseVectors", sparse),
+      functionCheck = sparse => {
+        val res = sparse + sparse
+        res shouldBe a[SparseVector]
+        res shouldEqual expected
+      }
+    )
+  }
+
+  property("Vectors add in reduce") {
+    forAll(sparseVectorsRDDGen) { rdd: RDD[Vector] =>
+      if (!rdd.isEmpty()) {
+        val tolerance = 1e-9 // we are losing precision here, hence the tolerance
+        implicit val doubleEq = TolerantNumerics.tolerantDoubleEquality(tolerance)
+
+        val expected = rdd.map(_.toArray).reduce(Monoid.arrayMonoid[Double].plus)
+        for {
+          res <- Seq(
+            () => rdd.reduce(_ + _),
+            () => rdd.reduce(_.toDense + _),
+            () => rdd.reduce(_ + _.toDense),
+            () => rdd.reduce(_.toDense + _.toDense)
+          )
+          (v, exp) <- res().toArray.zip(expected)
+        } v shouldEqual exp
+      }
+    }
+  }
+
+}
+
+object VectorGenerators {
+
+  val size = 100
+
+  val sparseVectorGen = for {
+    indices <- Gen.listOfN(size, Gen.choose(0, size - 1))
+    values <- Gen.listOfN(size, Gen.choose(-100000.0, 100000.0).filter(!_.isNaN))
+    idx = indices.distinct.sorted.toArray
+  } yield new SparseVector(size, idx, values.toArray.take(idx.length))
+
+}
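One practical payoff of the enrichment, echoing the "Vectors add in reduce" property above (illustrative only; the object name and local SparkSession setup are not from this change): feature vectors can be summed across an RDD directly, without densifying sparse rows first.

import com.salesforce.op.utils.spark.RichVector._
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.SparkSession

object VectorSumExample extends App {
  val spark = SparkSession.builder().master("local[*]").appName("vector-sum").getOrCreate()

  // Sum mixed sparse/dense vectors across a distributed collection
  val vectors = spark.sparkContext.parallelize(Seq(
    Vectors.sparse(3, Array(0), Array(1.0)),
    Vectors.sparse(3, Array(2), Array(2.0)),
    Vectors.dense(1.0, 1.0, 1.0)
  ))
  val total: Vector = vectors.reduce(_ + _)
  println(total) // [2.0,1.0,3.0]

  spark.stop()
}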
diff --git a/gradle.properties b/gradle.properties
index 1363c922c9..3ee2b49445 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,2 +1,2 @@
-version=3.3.1
+version=3.3.3
 group=com.salesforce
diff --git a/gradle/version-properties.gradle b/gradle/version-properties.gradle
index ff2661e452..c879034ed9 100644
--- a/gradle/version-properties.gradle
+++ b/gradle/version-properties.gradle
@@ -36,4 +36,4 @@ configure(allprojects - project(':templates').subprojects - project(':docs')) {
     }
   }
 }
-}
+}
\ No newline at end of file
diff --git a/models/README.md b/models/README.md
new file mode 100644
index 0000000000..0d15f563aa
--- /dev/null
+++ b/models/README.md
@@ -0,0 +1,5 @@
+# Models
+
+This project contains all the pretrained models used in OP, e.g. the OpenNLP POS and NER models.
+
+Include this project as a runtime dependency if you use any such models in your application; otherwise it's optional.
\ No newline at end of file
diff --git a/models/build.gradle b/models/build.gradle
new file mode 100644
index 0000000000..086e0b38a9
--- /dev/null
+++ b/models/build.gradle
@@ -0,0 +1,4 @@
+jar {
+  // Avoid compressing models, since it's quite slow
+  entryCompression = ZipEntryCompression.STORED
+}
diff --git a/models/src/main/resources/OpenNLP/da-pos-maxent.bin b/models/src/main/resources/OpenNLP/da-pos-maxent.bin
new file mode 100644
index 0000000000..8fade7ce97
Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-pos-maxent.bin differ
diff --git a/models/src/main/resources/OpenNLP/da-pos-perceptron.bin b/models/src/main/resources/OpenNLP/da-pos-perceptron.bin
new file mode 100644
index 0000000000..baabfda3b6
Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-pos-perceptron.bin differ
diff --git a/models/src/main/resources/OpenNLP/da-sent.bin b/models/src/main/resources/OpenNLP/da-sent.bin
new file mode 100644
index 0000000000..9913d530e7
Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-sent.bin differ
diff --git a/models/src/main/resources/OpenNLP/da-token.bin b/models/src/main/resources/OpenNLP/da-token.bin
new file mode 100644
index 0000000000..994d07e090
Binary files /dev/null and b/models/src/main/resources/OpenNLP/da-token.bin differ
diff --git a/models/src/main/resources/OpenNLP/de-pos-maxent.bin b/models/src/main/resources/OpenNLP/de-pos-maxent.bin
new file mode 100644
index 0000000000..c564d56ceb
Binary files /dev/null and b/models/src/main/resources/OpenNLP/de-pos-maxent.bin differ
diff --git a/models/src/main/resources/OpenNLP/de-pos-perceptron.bin b/models/src/main/resources/OpenNLP/de-pos-perceptron.bin
new file mode 100644
index 0000000000..c79debd6e1
Binary files /dev/null and b/models/src/main/resources/OpenNLP/de-pos-perceptron.bin differ
diff --git a/models/src/main/resources/OpenNLP/de-sent.bin b/models/src/main/resources/OpenNLP/de-sent.bin
new file mode 100644
index 0000000000..71d4e5ddd9
Binary files /dev/null and b/models/src/main/resources/OpenNLP/de-sent.bin differ
diff --git a/models/src/main/resources/OpenNLP/de-token.bin b/models/src/main/resources/OpenNLP/de-token.bin
new file mode 100644
index 0000000000..380e7ff058
Binary files /dev/null and b/models/src/main/resources/OpenNLP/de-token.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-chunker.bin b/models/src/main/resources/OpenNLP/en-chunker.bin
new file mode 100644
index 0000000000..65d9356888
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-chunker.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-ner-date.bin b/models/src/main/resources/OpenNLP/en-ner-date.bin
new file mode 100644
index 0000000000..a69923ac42
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-date.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-ner-location.bin b/models/src/main/resources/OpenNLP/en-ner-location.bin
new file mode 100644
index 0000000000..f3788bc1f6
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-location.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-ner-money.bin b/models/src/main/resources/OpenNLP/en-ner-money.bin
new file mode 100644
index 0000000000..2431e0f5ee
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-money.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-ner-organization.bin b/models/src/main/resources/OpenNLP/en-ner-organization.bin
new file mode 100644
index 0000000000..1fb6d9fa8f
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-organization.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-ner-percentage.bin b/models/src/main/resources/OpenNLP/en-ner-percentage.bin
new file mode 100644
index 0000000000..98cee1a341
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-percentage.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-ner-person.bin b/models/src/main/resources/OpenNLP/en-ner-person.bin
new file mode 100644
index 0000000000..2f68318203
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-person.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-ner-time.bin b/models/src/main/resources/OpenNLP/en-ner-time.bin
new file mode 100644
index 0000000000..a5d8aa14d8
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-ner-time.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-parser-chunking.bin b/models/src/main/resources/OpenNLP/en-parser-chunking.bin
new file mode 100644
index 0000000000..7550609ebc
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-parser-chunking.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-pos-maxent.bin b/models/src/main/resources/OpenNLP/en-pos-maxent.bin
new file mode 100644
index 0000000000..c8cae23c5f
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-pos-maxent.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-pos-perceptron.bin b/models/src/main/resources/OpenNLP/en-pos-perceptron.bin
new file mode 100644
index 0000000000..b52903fd10
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-pos-perceptron.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-sent.bin b/models/src/main/resources/OpenNLP/en-sent.bin
new file mode 100644
index 0000000000..e89076be5a
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-sent.bin differ
diff --git a/models/src/main/resources/OpenNLP/en-token.bin b/models/src/main/resources/OpenNLP/en-token.bin
new file mode 100644
index 0000000000..c417277ca7
Binary files /dev/null and b/models/src/main/resources/OpenNLP/en-token.bin differ
diff --git a/models/src/main/resources/OpenNLP/es-ner-location.bin b/models/src/main/resources/OpenNLP/es-ner-location.bin
new file mode 100644
index
0000000000..a9a7223c78 Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-location.bin differ diff --git a/models/src/main/resources/OpenNLP/es-ner-misc.bin b/models/src/main/resources/OpenNLP/es-ner-misc.bin new file mode 100644 index 0000000000..d45eceedef Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-misc.bin differ diff --git a/models/src/main/resources/OpenNLP/es-ner-organization.bin b/models/src/main/resources/OpenNLP/es-ner-organization.bin new file mode 100644 index 0000000000..efb75ba20c Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-organization.bin differ diff --git a/models/src/main/resources/OpenNLP/es-ner-person.bin b/models/src/main/resources/OpenNLP/es-ner-person.bin new file mode 100644 index 0000000000..f4a15c0174 Binary files /dev/null and b/models/src/main/resources/OpenNLP/es-ner-person.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-location.bin b/models/src/main/resources/OpenNLP/nl-ner-location.bin new file mode 100644 index 0000000000..3cf9081f78 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-location.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-misc.bin b/models/src/main/resources/OpenNLP/nl-ner-misc.bin new file mode 100644 index 0000000000..cdf8144eb8 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-misc.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-organization.bin b/models/src/main/resources/OpenNLP/nl-ner-organization.bin new file mode 100644 index 0000000000..dd962adbc4 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-organization.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-ner-person.bin b/models/src/main/resources/OpenNLP/nl-ner-person.bin new file mode 100644 index 0000000000..cd3df4efd3 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-ner-person.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-pos-maxent.bin b/models/src/main/resources/OpenNLP/nl-pos-maxent.bin new file mode 100644 index 0000000000..170e1e8b15 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-pos-perceptron.bin b/models/src/main/resources/OpenNLP/nl-pos-perceptron.bin new file mode 100644 index 0000000000..7db9bce873 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-sent.bin b/models/src/main/resources/OpenNLP/nl-sent.bin new file mode 100644 index 0000000000..f212e279d5 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/nl-token.bin b/models/src/main/resources/OpenNLP/nl-token.bin new file mode 100644 index 0000000000..cb6190c341 Binary files /dev/null and b/models/src/main/resources/OpenNLP/nl-token.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-pos-maxent.bin b/models/src/main/resources/OpenNLP/pt-pos-maxent.bin new file mode 100644 index 0000000000..12c666ac66 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-pos-perceptron.bin b/models/src/main/resources/OpenNLP/pt-pos-perceptron.bin new file mode 100644 index 0000000000..2fe7ccf293 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-sent.bin 
b/models/src/main/resources/OpenNLP/pt-sent.bin new file mode 100644 index 0000000000..c2c537bb33 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/pt-token.bin b/models/src/main/resources/OpenNLP/pt-token.bin new file mode 100644 index 0000000000..0fc90a3669 Binary files /dev/null and b/models/src/main/resources/OpenNLP/pt-token.bin differ diff --git a/models/src/main/resources/OpenNLP/se-pos-maxent.bin b/models/src/main/resources/OpenNLP/se-pos-maxent.bin new file mode 100644 index 0000000000..1e4ce32ec5 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-pos-maxent.bin differ diff --git a/models/src/main/resources/OpenNLP/se-pos-perceptron.bin b/models/src/main/resources/OpenNLP/se-pos-perceptron.bin new file mode 100644 index 0000000000..572241ef70 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-pos-perceptron.bin differ diff --git a/models/src/main/resources/OpenNLP/se-sent.bin b/models/src/main/resources/OpenNLP/se-sent.bin new file mode 100644 index 0000000000..4a0b702545 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-sent.bin differ diff --git a/models/src/main/resources/OpenNLP/se-token.bin b/models/src/main/resources/OpenNLP/se-token.bin new file mode 100644 index 0000000000..d66c8709a9 Binary files /dev/null and b/models/src/main/resources/OpenNLP/se-token.bin differ diff --git a/models/src/main/resources/OpenNLP/vesion b/models/src/main/resources/OpenNLP/vesion new file mode 100644 index 0000000000..14ac82435a --- /dev/null +++ b/models/src/main/resources/OpenNLP/vesion @@ -0,0 +1,2 @@ +OpenNLP models - Version 1.5 +Downloaded from - http://opennlp.sourceforge.net/models-1.5 diff --git a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala index 8602c54f7a..5a250df992 100644 --- a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala +++ b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala @@ -31,7 +31,6 @@ package com.salesforce.op.test -import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.features.{FeatureBuilder, OPFeature} import com.salesforce.op.utils.tuples.RichTuple._ @@ -40,8 +39,6 @@ import org.joda.time.Duration trait PassengerFeaturesTest { - UID.reset() - val age = FeatureBuilder.Real[Passenger] .extract(_.getAge.toReal) .aggregate((l, r) => (l -> r).map(breeze.linalg.max(_, _))) @@ -64,6 +61,7 @@ trait PassengerFeaturesTest { val booleanMap = FeatureBuilder.BinaryMap[Passenger].extract(p => p.getBooleanMap.toBinaryMap).asPredictor val survived = FeatureBuilder.Binary[Passenger].extract(p => Option(p.getSurvived).map(_ == 1).toBinary).asResponse val boardedTime = FeatureBuilder.Date[Passenger].extract(_.getBoarded.toLong.toDate).asPredictor + val boardedTimeAsDateTime = FeatureBuilder.DateTime[Passenger].extract(_.getBoarded.toLong.toDateTime).asPredictor val rawFeatures: Array[OPFeature] = Array( survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap diff --git a/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala b/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala index be17156ee5..2e6802c660 100644 --- a/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala +++ b/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala 
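For reference, consuming one of the models packaged above is just a classpath resource lookup. A minimal sketch (illustrative only; it assumes the models jar is on the runtime classpath as the README advises, and uses standard OpenNLP 1.x classes):

import opennlp.tools.tokenize.{TokenizerME, TokenizerModel}

// The resource path mirrors the files added under models/src/main/resources above
val in = getClass.getResourceAsStream("/OpenNLP/en-token.bin")
require(in != null, "the models jar must be on the runtime classpath")
val tokenizer = try new TokenizerME(new TokenizerModel(in)) finally in.close()
tokenizer.tokenize("Optimus Prime rolls out.") // e.g. Array("Optimus", "Prime", "rolls", "out", ".")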
diff --git a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala
index 8602c54f7a..5a250df992 100644
--- a/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala
+++ b/readers/src/main/scala/com/salesforce/op/test/PassengerFeaturesTest.scala
@@ -31,7 +31,6 @@
 package com.salesforce.op.test
 
-import com.salesforce.op.UID
 import com.salesforce.op.features.types._
 import com.salesforce.op.features.{FeatureBuilder, OPFeature}
 import com.salesforce.op.utils.tuples.RichTuple._
@@ -40,8 +39,6 @@ import org.joda.time.Duration
 
 trait PassengerFeaturesTest {
 
-  UID.reset()
-
   val age = FeatureBuilder.Real[Passenger]
     .extract(_.getAge.toReal)
     .aggregate((l, r) => (l -> r).map(breeze.linalg.max(_, _)))
@@ -64,6 +61,7 @@ trait PassengerFeaturesTest {
   val booleanMap = FeatureBuilder.BinaryMap[Passenger].extract(p => p.getBooleanMap.toBinaryMap).asPredictor
   val survived = FeatureBuilder.Binary[Passenger].extract(p => Option(p.getSurvived).map(_ == 1).toBinary).asResponse
   val boardedTime = FeatureBuilder.Date[Passenger].extract(_.getBoarded.toLong.toDate).asPredictor
+  val boardedTimeAsDateTime = FeatureBuilder.DateTime[Passenger].extract(_.getBoarded.toLong.toDateTime).asPredictor
 
   val rawFeatures: Array[OPFeature] = Array(
     survived, age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap
diff --git a/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala b/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala
index be17156ee5..2e6802c660 100644
--- a/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala
+++ b/readers/src/main/scala/com/salesforce/op/test/PassengerSparkFixtureTest.scala
@@ -57,13 +57,13 @@ trait PassengerSparkFixtureTest extends TestSparkContext with PassengerFeaturesT
     key = _.getPassengerId.toString // Entity to score
   )
 
-  val simpleCsvReader = DataReaders.Simple.csv[PassengerCSV](
+  lazy val simpleCsvReader = DataReaders.Simple.csv[PassengerCSV](
     path = Some(passengerCsvPath), // Path should be optional so can also pass in as a parameter
     schema = PassengerCSV.getClassSchema.toString, // Input schema
     key = _.getPassengerId.toString // Entity to score
   )
 
-  val simpleStreamingReader = StreamingReaders.Simple.avro[Passenger](
+  lazy val simpleStreamingReader = StreamingReaders.Simple.avro[Passenger](
     key = _.getPassengerId.toString // Entity to score
   )
diff --git a/resources/materializingdata.png b/resources/materializingdata.png
new file mode 100644
index 0000000000..eab29e2a0e
Binary files /dev/null and b/resources/materializingdata.png differ
diff --git a/resources/stages.png b/resources/stages.png
new file mode 100644
index 0000000000..49e009e3b4
Binary files /dev/null and b/resources/stages.png differ
diff --git a/resources/workflows.png b/resources/workflows.png
new file mode 100644
index 0000000000..5feb167e3c
Binary files /dev/null and b/resources/workflows.png differ
diff --git a/settings.gradle b/settings.gradle
index eb42f67ae1..1771889027 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -1,4 +1,4 @@
 rootProject.name='optimus-prime'
 
-include 'utils', 'features', 'readers', 'core', 'testkit', 'cli', 'templates:simple', 'docs'
+include 'utils', 'features', 'readers', 'core', 'models', 'testkit', 'cli', 'templates:simple', 'docs'
diff --git a/test-data/DataGeneration.sc b/test-data/DataGeneration.sc
index f29bbd0a4f..ac1a9e41d5 100644
--- a/test-data/DataGeneration.sc
+++ b/test-data/DataGeneration.sc
@@ -1,6 +1,32 @@
 /*
  * Copyright (c) 2017, Salesforce.com, Inc.
  * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
  */
 
 import org.apache.spark.ml.linalg.Vectors
@@ -28,8 +54,8 @@ object DataGeneration {
 
   val df2 = df.toDF(names: _*)
 
   case class PassengerDG(passengerId: Int, age: Option[Int], gender: String,
-    height: Int, weight: Int, description: Option[String],
-    boarded: Long, recordDate: Long, survived: Boolean)
+                         height: Int, weight: Int, description: Option[String],
+                         boarded: Long, recordDate: Long, survived: Boolean)
 
   val ds = df2.as[PassengerDG]
diff --git a/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala b/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala
index ec5982ffc6..743ea787a3 100644
--- a/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala
+++ b/utils/src/main/scala/com/salesforce/op/test/TestCommon.scala
@@ -64,7 +64,6 @@ trait TestCommon extends Matchers with Assertions {
 
   case object Spec {
     def apply[T: ClassTag]: String = apply(classTag[T].runtimeClass)
     def apply[T1: ClassTag, T2: ClassTag]: String = apply[T2] + "[" + apply[T1] + "]"
-
     def apply(klazz: Class[_]): String = klazz.getSimpleName.stripSuffix("$")
   }
diff --git a/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala b/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala
index 5b24a7ecba..598d8922db 100644
--- a/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala
+++ b/utils/src/main/scala/com/salesforce/op/test/TestSparkContext.scala
@@ -36,9 +36,8 @@ import org.scalatest.Suite
 
 trait TestSparkContext extends TempDirectoryTest with TestCommon { self: Suite =>
 
-  // Remove Logging of OWLQN and LBFGS used in LogisticRegression
-  Logger.getLogger("breeze.optimize.OWLQN").setLevel(Level.WARN)
-  Logger.getLogger("breeze.optimize.LBFGS").setLevel(Level.WARN)
+  // Remove Breeze logging noise
+  Logger.getLogger("breeze.optimize").setLevel(Level.WARN)
 
   lazy val kryoClasses: Array[Class[_]] = Array(
     classOf[com.salesforce.op.test.Passenger],
diff --git a/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala b/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala
index 20ebbe5838..ad00a19699 100644
--- a/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala
+++ b/utils/src/main/scala/com/salesforce/op/utils/reflection/ReflectionUtils.scala
@@ -146,16 +146,34 @@ object ReflectionUtils {
   def reflectSetterMethod[T: ClassTag](
     instance: T,
     setterName: String,
+    inputs: Seq[Any],
     classLoader: ClassLoader = defaultClassLoader
-  ): Option[MethodMirror] = {
+  ): Any = {
+    reflectMethod(instance, s"set$setterName", classLoader).apply(inputs: _*)
+  }
+
+  /**
+   * Find a method with the provided name and reflect it
+   *
+   * @param instance instance to find the method on
+   * @param methodName name of the method to find
+   * @param classLoader class loader to use
+   * @tparam T type of the instance
+   * @return reflected method mirror
+   */
+  def reflectMethod[T: ClassTag](
+    instance: T,
+    methodName: String,
+    classLoader: ClassLoader = defaultClassLoader
+  ): MethodMirror = {
     val klazz = instance.getClass
     val (runtimeMirror, classMirror) = mirrors(klazz, classLoader)
     val classType = runtimeMirror.classSymbol(klazz).toType
     val tMembers = classType.members
-    val settrs = tMembers.collect { case m: MethodSymbol if m.isPublic &&
-      termNameStr(m.name).compareToIgnoreCase(s"set$setterName") == 0 => m }
+    val methods = tMembers.collect { case m: MethodSymbol if m.isMethod &&
+      termNameStr(m.name).compareToIgnoreCase(methodName) == 0 => m
+    }
     val instanceMirror = runtimeMirror.reflect(instance)
-    settrs.headOption.map(instanceMirror.reflectMethod(_))
+    instanceMirror.reflectMethod(methods.head)
   }
 
   /**
@@ -168,6 +186,26 @@
    */
   def classForName(name: String, classLoader: ClassLoader = defaultClassLoader): Class[_] =
     classLoader.loadClass(name)
+
+  /**
+   * Fully dealiased type name for [[Type]].
+   * This method performs recursive dealiasing, unlike a regular type.dealias, which dealiases one level only.
+   *
+   * E.g: given a type of "Map[String,Double]" the result is
+   * "scala.collection.immutable.Map[java.lang.String,scala.Double]"
+   *
+   * @param t type to dealias
+   * @return fully dealiased type name
+   */
+  def dealisedTypeName(t: Type): String = {
+    val dealised = t.dealias
+    if (dealised.typeArgs.isEmpty) dealised.typeSymbol.fullName
+    else {
+      dealised.typeConstructor.dealias.typeSymbol.fullName +
+        dealised.typeArgs.map(dealisedTypeName).mkString("[", ",", "]")
+    }
+  }
+
   /**
    * Create a TypeTag for Type
    *
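To make the reworked reflection API concrete, here is a minimal usage sketch (Widget and its methods are hypothetical; only reflectSetterMethod and reflectMethod come from this change):

import com.salesforce.op.utils.reflection.ReflectionUtils

// A throwaway class just to illustrate the API shape
class Widget {
  private var label: Option[String] = None
  def setLabel(s: String): this.type = { label = Some(s); this }
  def getLabel: Option[String] = label
}

val w = new Widget
// reflectSetterMethod now applies the arguments immediately (matching "set" + name case-insensitively),
// where it previously returned an Option[MethodMirror] for the caller to apply
ReflectionUtils.reflectSetterMethod(w, "label", Seq("hello"))
// reflectMethod returns a reusable MethodMirror, so the expensive member lookup happens only once
val getLabel = ReflectionUtils.reflectMethod(w, "getLabel")
getLabel.apply() // Some("hello")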
diff --git a/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala b/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala
index 31d5735fe6..753e2b3622 100644
--- a/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala
+++ b/utils/src/main/scala/com/salesforce/op/utils/spark/RichMetadata.scala
@@ -34,6 +34,7 @@ package com.salesforce.op.utils.spark
 import org.apache.spark.sql.types._
 
 import scala.collection.mutable.{Map => MMap}
+import shapeless._
 
 object RichMetadata {
 
@@ -103,7 +104,7 @@ object RichMetadata {
           case (Some(av: String), Some(bv: String)) => av + bv
           case (Some(av: Metadata), Some(bv: Metadata)) => av.deepMerge(bv)
           case (Some(av), Some(bv)) => throw new RuntimeException(
-            s"Failed to merge metadatas for key $key due to incompatible value types '$av' and '$bv'"
+            s"Failed to merge metadata for key $key due to incompatible value types '$av' and '$bv'"
           )
         }
         res += key -> resVal
@@ -150,24 +151,40 @@ object RichMetadata {
     }
   }
 
+  private val booleanSeq = TypeCase[Seq[Boolean]]
+  private val longSeq = TypeCase[Seq[Long]]
+  private val intSeq = TypeCase[Seq[Int]]
+  private val doubleSeq = TypeCase[Seq[Double]]
+  private val stringSeq = TypeCase[Seq[String]]
+
   /**
    * Enrichment functions for Maps
    * @param theMap Map[String, Any]
    */
   implicit class RichMap(val theMap: Map[String, Any]) extends AnyVal {
 
-    def toMetadata: Metadata = theMap.foldLeft(new MetadataBuilder()) {
-      case (m, (k, v: Boolean)) => m.putBoolean(k, v)
-      case (m, (k, v: Double)) => m.putDouble(k, v)
-      case (m, (k, v: Long)) => m.putLong(k, v)
-      case (m, (k, v: String)) => m.putString(k, v)
-      case (m, (k, v: Array[Boolean])) => m.putBooleanArray(k, v)
-      case (m, (k, v: Array[Double])) => m.putDoubleArray(k, v)
-      case (m, (k, v: Array[Long])) => m.putLongArray(k, v)
-      case (m, (k, v: Array[String])) => m.putStringArray(k, v)
-      case (_, (k, v)) => throw new RuntimeException(s"Key '$k' has unsupported value type")
-    }.build()
-
+    def toMetadata: Metadata = {
+      val builder = new MetadataBuilder()
+      def unsupported(k: String) = throw new RuntimeException(s"Key '$k' has unsupported value type")
+      def putCollection(key: String, seq: Seq[Any]): MetadataBuilder = seq match {
+        case booleanSeq(v) => builder.putBooleanArray(key, v.toArray)
+        case intSeq(v) => builder.putLongArray(key, v.map(_.toLong).toArray)
+        case longSeq(v) => builder.putLongArray(key, v.toArray)
+        case doubleSeq(v) => builder.putDoubleArray(key, v.toArray)
+        case stringSeq(v) => builder.putStringArray(key, v.toArray)
+        case _ => unsupported(key)
+      }
+      theMap.foldLeft(builder) {
+        case (m, (k, v: Boolean)) => m.putBoolean(k, v)
+        case (m, (k, v: Double)) => m.putDouble(k, v)
+        case (m, (k, v: Long)) => m.putLong(k, v)
+        case (m, (k, v: String)) => m.putString(k, v)
+        case (m, (k, v: Seq[_])) => putCollection(k, v)
+        case (m, (k, v: Array[_])) => putCollection(k, v)
+        case (m, (k, v: Map[_, _])) => m.putMetadata(k, v.map { case (k, v) => k.toString -> v }.toMetadata)
+        case (_, (k, _)) => unsupported(k)
+      }.build()
+    }
   }
 }
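The shapeless TypeCase values exist because Seq element types are erased at runtime; TypeCase inspects the actual elements, so the new toMetadata can dispatch Seq[_] and Array[_] values safely. A small usage sketch of the enriched API above (the keys and values are made up; each comment names the branch taken):

import com.salesforce.op.utils.spark.RichMetadata._

val meta = Map[String, Any](
  "isRaw" -> true,              // putBoolean
  "count" -> 3L,                // putLong
  "scores" -> Seq(1, 2, 3),     // intSeq TypeCase, widened to Long via putLongArray
  "labels" -> Array("a", "b"),  // Array[_] values are routed through putCollection as well
  "stats" -> Map("mean" -> 0.5) // nested maps recurse through putMetadata
).toMetadata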
diff --git a/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala b/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala
index fdfa1311fd..57c8a1606e 100644
--- a/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala
+++ b/utils/src/main/scala/com/salesforce/op/utils/text/LanguageDetector.scala
@@ -66,9 +66,11 @@ object Language extends Enum[Language] {
   case object Asturian extends Language("ast")
   case object Belarusian extends Language("be")
   case object Breton extends Language("br")
-  case object Catalan extends Language("ca")
   case object Bulgarian extends Language("bg")
   case object Bengali extends Language("bn")
+  case object Brazilian extends Language("br")
+  case object Catalan extends Language("ca")
+  case object Sorani extends Language("ckb")
   case object Czech extends Language("cs")
   case object Welsh extends Language("cy")
   case object Danish extends Language("da")
@@ -112,6 +114,7 @@ object Language extends Enum[Language] {
   case object Portuguese extends Language("pt")
   case object Romanian extends Language("ro")
   case object Russian extends Language("ru")
+  case object Sami extends Language("se")
   case object Slovak extends Language("sk")
   case object Slovene extends Language("sl")
   case object Somali extends Language("so")
diff --git a/utils/src/main/scala/com/salesforce/op/utils/text/NameEntityTagger.scala b/utils/src/main/scala/com/salesforce/op/utils/text/NameEntityTagger.scala
new file mode 100644
index 0000000000..5a1da39261
--- /dev/null
+++ b/utils/src/main/scala/com/salesforce/op/utils/text/NameEntityTagger.scala
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.utils.text
+
+import enumeratum.{Enum, EnumEntry}
+
+/**
+ * Interface for Name Entity Recognition tagger
+ *
+ * @tparam Result result of the [[NameEntityTagger.tag]] function call
+ */
+trait NameEntityTagger[Result <: TaggerResult] extends Serializable {
+
+  /**
+   * Apply the name entity recognition model on the sentence tokens to retrieve information
+   *
+   * @param tokens sentence tokens
+   * @param language language
+   * @param entitiesToTag entities to tag if found
+   * @return map of entity and corresponding tokens
+   */
+  def tag(tokens: Seq[String], language: Language, entitiesToTag: Seq[NameEntityType]): Result
+
+}
+
+/**
+ * Result of [[NameEntityTagger.tag]] function call
+ */
+trait TaggerResult extends Serializable {
+
+  /**
+   * Result must be convertible to Map,
+   * where keys are tokens and values are entities matching each token
+   */
+  def tokenTags: Map[String, Set[NameEntityType]]
+
+}
+
+
+/**
+ * Name Entity Recognition entity type
+ */
+sealed trait NameEntityType extends EnumEntry with Serializable
+
+/**
+ * Name Entity Recognition entity type
+ */
+object NameEntityType extends Enum[NameEntityType] {
+  val values = findValues
+  case object Date extends NameEntityType
+  case object Location extends NameEntityType
+  case object Money extends NameEntityType
+  case object Organization extends NameEntityType
+  case object Percentage extends NameEntityType
+  case object Person extends NameEntityType
+  case object Time extends NameEntityType
+  case object Misc extends NameEntityType
+  case object Other extends NameEntityType
+}
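NameEntityTagger is only an interface; for concreteness, a minimal OpenNLP-backed sketch might look like the following (the tagger class and its result type are hypothetical; only the NameEntityTagger/TaggerResult traits and the en-ner-person.bin resource come from this change, while NameFinderME and TokenNameFinderModel are standard OpenNLP classes):

import opennlp.tools.namefind.{NameFinderME, TokenNameFinderModel}

case class TokenTagsResult(tokenTags: Map[String, Set[NameEntityType]]) extends TaggerResult

class PersonTaggerSketch extends NameEntityTagger[TokenTagsResult] {
  def tag(tokens: Seq[String], language: Language, entitiesToTag: Seq[NameEntityType]): TokenTagsResult = {
    val in = getClass.getResourceAsStream("/OpenNLP/en-ner-person.bin")
    val finder = new NameFinderME(new TokenNameFinderModel(in))
    try {
      // Each OpenNLP Span marks a [start, end) token range recognized as a person name
      val tagged = for {
        span <- finder.find(tokens.toArray).toSeq
        i <- span.getStart until span.getEnd
      } yield tokens(i) -> Set[NameEntityType](NameEntityType.Person)
      TokenTagsResult(tagged.toMap)
    } finally in.close()
  }
}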
diff --git a/utils/src/main/scala/com/salesforce/op/utils/text/SentenceSplitter.scala b/utils/src/main/scala/com/salesforce/op/utils/text/SentenceSplitter.scala
new file mode 100644
index 0000000000..fddf2dbeb0
--- /dev/null
+++ b/utils/src/main/scala/com/salesforce/op/utils/text/SentenceSplitter.scala
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of Salesforce.com nor the names of its contributors may
+ * be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.utils.text
+
+/**
+ * Interface for Sentence Splitter that allows detecting and splitting text
+ * into separate sentences.
+ */
+trait SentenceSplitter extends Serializable {
+
+  /**
+   * Get sentences from the text
+   *
+   * @param input text input
+   * @param language language
+   * @return sentences
+   */
+  def getSentences(input: String, language: Language): Seq[String]
+}
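As with the tagger, an implementation of this trait is left to the reader; a hypothetical English-only sketch backed by the packaged sentence model might be (SentenceDetectorME and SentenceModel are standard OpenNLP classes, and the language parameter is deliberately ignored here):

import opennlp.tools.sentdetect.{SentenceDetectorME, SentenceModel}

class EnglishSentenceSplitterSketch extends SentenceSplitter {
  def getSentences(input: String, language: Language): Seq[String] = {
    // en-sent.bin is one of the models packaged in the models project above
    val in = getClass.getResourceAsStream("/OpenNLP/en-sent.bin")
    require(in != null, "the models jar must be on the runtime classpath")
    try new SentenceDetectorME(new SentenceModel(in)).sentDetect(input).toSeq
    finally in.close()
  }
}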
diff --git a/utils/src/test/scala/com/salesforce/op/UIDTest.scala b/utils/src/test/scala/com/salesforce/op/UIDTest.scala
index 74d6191cf3..3f717b1346 100644
--- a/utils/src/test/scala/com/salesforce/op/UIDTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/UIDTest.scala
@@ -75,3 +75,4 @@ class UIDTest extends FlatSpec with TestCommon {
     intercept[IllegalArgumentException](UID.fromString("foo")).getMessage shouldBe "Invalid UID: foo"
   }
 }
+
diff --git a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala
index 11e9fb8fa9..639d11e915 100644
--- a/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/utils/reflection/ReflectionUtilsTest.scala
@@ -79,6 +79,8 @@ class TestClassVar {
     myVar = Option(s)
     this
   }
+  private def getValue: Int = 2
+  def getValuePerf: Int = 2
 }
 
 @RunWith(classOf[JUnitRunner])
@@ -109,6 +111,14 @@ class ReflectionUtilsTest extends FlatSpec with Matchers {
     dealiasedTag.tpe shouldBe aliasTag.tpe.dealias
   }
 
+  it should "deep dealias types" in {
+    val tt = typeTag[Map[String, Seq[(Double, ListStringAlias)]]].tpe
+    ReflectionUtils.dealisedTypeName(tt) shouldBe
+      "scala.collection.immutable.Map[" +
+        "java.lang.String," +
+        "scala.collection.Seq[scala.Tuple2[scala.Double,scala.collection.immutable.List[java.lang.String]]]]"
+  }
+
   it should "allow copying a class" in {
     val orig = new TestClass[TestValClass](
       i = 123,
@@ -191,9 +201,31 @@ class ReflectionUtilsTest extends FlatSpec with Matchers {
 
   it should "allow you to find and use a setter for a class" in {
     val myClass = new TestClassVar()
-    val setter = ReflectionUtils.reflectSetterMethod(myClass, "myVar")
-    setter.map(_.apply("yay"))
+    val setter = ReflectionUtils.reflectSetterMethod(myClass, "myVar", Seq("yay"))
     myClass.myVar shouldBe Some("yay")
   }
 
+  it should "allow you to find and use a private method for a class" in {
+    val myClass = new TestClassVar()
+    val value = ReflectionUtils.reflectMethod(myClass, "getValue").apply()
+    value shouldBe 2
+  }
+
+  it should "execute reflected methods quickly" in {
+    val myClass = new TestClassVar()
+    val method = ReflectionUtils.reflectMethod(myClass, "getValue")
+    val max = 100000
+    def measure(fun: => Int): Long = {
+      val start = System.currentTimeMillis()
+      (0 until max).foreach(_ => fun shouldBe 2)
+      System.currentTimeMillis() - start
+    }
+    val warmUp = measure(method.apply().asInstanceOf[Int]) -> measure(myClass.getValuePerf) // warm up
+    val elapsedReflect = measure(method.apply().asInstanceOf[Int])
+    val actual = measure(myClass.getValuePerf)
+
+    elapsedReflect should be <= 5 * actual
+  }
+
 }
+
diff --git a/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala b/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala
index ae4be94d1b..3007b15601 100644
--- a/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/utils/spark/RichMetadataTest.scala
@@ -33,6 +33,8 @@ package com.salesforce.op.utils.spark
 
 import com.salesforce.op.test.TestCommon
 import org.apache.spark.sql.types.{MetadataBuilder, StructField}
+import org.json4s.DefaultFormats
+import org.json4s.jackson.Serialization
 import org.junit.runner.RunWith
 import org.scalatest.junit.JUnitRunner
 import org.scalatest.{FlatSpec, Matchers}
@@ -46,10 +48,12 @@ class RichMetadataTest extends FlatSpec with TestCommon {
 
   Spec[RichMetadata] should "create a metadata from a map" in {
     val expected = Map(
       "1" -> 1L, "2" -> 1.0, "3" -> true, "4" -> "1",
-      "5" -> Array(1L), "6" -> Array(1.0), "6" -> Array(true), "7" -> Array("1")
+      "5" -> Array(1L), "6" -> Array(1.0), "6" -> Array(true), "7" -> Array("1"),
+      "8" -> Seq(1L), "9" -> Seq(1.0), "10" -> Seq(true), "11" -> Seq("1")
     )
     val meta = expected.toMetadata
-    meta.underlyingMap.toSeq shouldBe expected.toSeq
+    implicit val formats = DefaultFormats
+    meta.json shouldBe Serialization.write(expected)
   }
 
   it should "throw an error on unsupported type in a map" in {