1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Tests for estimators.linear."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import functools
22import json
23import tempfile
24
25import numpy as np
26
27from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
28from tensorflow.contrib.learn.python.learn import experiment
29from tensorflow.contrib.learn.python.learn.datasets import base
30from tensorflow.contrib.learn.python.learn.estimators import _sklearn
31from tensorflow.contrib.learn.python.learn.estimators import estimator
32from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
33from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
34from tensorflow.contrib.learn.python.learn.estimators import linear
35from tensorflow.contrib.learn.python.learn.estimators import run_config
36from tensorflow.contrib.learn.python.learn.estimators import test_data
37from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
38from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib
39from tensorflow.contrib.metrics.python.ops import metric_ops
40from tensorflow.python.feature_column import feature_column_lib as fc_core
41from tensorflow.python.framework import constant_op
42from tensorflow.python.framework import dtypes
43from tensorflow.python.framework import sparse_tensor
44from tensorflow.python.ops import array_ops
45from tensorflow.python.ops import math_ops
46from tensorflow.python.ops import partitioned_variables
47from tensorflow.python.platform import test
48from tensorflow.python.training import ftrl
49from tensorflow.python.training import input as input_lib
50from tensorflow.python.training import server_lib
51
52
def _prepare_iris_data_for_logistic_regression():
  """Returns the iris dataset restricted to examples of classes 0 and 1.

  Dropping every other class leaves a two-class problem that the logistic
  regression tests can train on.
  """
  full_iris = base.load_iris()
  is_binary_class = (full_iris.target == 0) | (full_iris.target == 1)
  selected = np.where(is_binary_class)
  return base.Dataset(
      data=full_iris.data[selected], target=full_iris.target[selected])
59
60
61class LinearClassifierTest(test.TestCase):
62
63  def testExperimentIntegration(self):
64    cont_features = [
65        feature_column_lib.real_valued_column(
66            'feature', dimension=4)
67    ]
68
69    exp = experiment.Experiment(
70        estimator=linear.LinearClassifier(
71            n_classes=3, feature_columns=cont_features),
72        train_input_fn=test_data.iris_input_multiclass_fn,
73        eval_input_fn=test_data.iris_input_multiclass_fn)
74    exp.test()
75
76  def testEstimatorContract(self):
77    estimator_test_utils.assert_estimator_contract(self,
78                                                   linear.LinearClassifier)
79
80  def testTrain(self):
81    """Tests that loss goes down with training."""
82
83    def input_fn():
84      return {
85          'age':
86              constant_op.constant([1]),
87          'language':
88              sparse_tensor.SparseTensor(
89                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
90      }, constant_op.constant([[1]])
91
92    language = feature_column_lib.sparse_column_with_hash_bucket('language',
93                                                                 100)
94    age = feature_column_lib.real_valued_column('age')
95
96    classifier = linear.LinearClassifier(feature_columns=[age, language])
97    classifier.fit(input_fn=input_fn, steps=100)
98    loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
99    classifier.fit(input_fn=input_fn, steps=200)
100    loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
101    self.assertLess(loss2, loss1)
102    self.assertLess(loss2, 0.01)
103
104  def testJointTrain(self):
105    """Tests that loss goes down with training with joint weights."""
106
107    def input_fn():
108      return {
109          'age':
110              sparse_tensor.SparseTensor(
111                  values=['1'], indices=[[0, 0]], dense_shape=[1, 1]),
112          'language':
113              sparse_tensor.SparseTensor(
114                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
115      }, constant_op.constant([[1]])
116
117    language = feature_column_lib.sparse_column_with_hash_bucket('language',
118                                                                 100)
119    age = feature_column_lib.sparse_column_with_hash_bucket('age', 2)
120
121    classifier = linear.LinearClassifier(
122        _joint_weight=True, feature_columns=[age, language])
123    classifier.fit(input_fn=input_fn, steps=100)
124    loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
125    classifier.fit(input_fn=input_fn, steps=200)
126    loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
127    self.assertLess(loss2, loss1)
128    self.assertLess(loss2, 0.01)
129
130  def testMultiClass_MatrixData(self):
131    """Tests multi-class classification using matrix data as input."""
132    feature_column = feature_column_lib.real_valued_column(
133        'feature', dimension=4)
134
135    classifier = linear.LinearClassifier(
136        n_classes=3, feature_columns=[feature_column])
137
138    classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100)
139    scores = classifier.evaluate(
140        input_fn=test_data.iris_input_multiclass_fn, steps=100)
141    self.assertGreater(scores['accuracy'], 0.9)
142
143  def testMultiClass_MatrixData_Labels1D(self):
144    """Same as the last test, but labels shape is [150] instead of [150, 1]."""
145
146    def _input_fn():
147      iris = base.load_iris()
148      return {
149          'feature': constant_op.constant(
150              iris.data, dtype=dtypes.float32)
151      }, constant_op.constant(
152          iris.target, shape=[150], dtype=dtypes.int32)
153
154    feature_column = feature_column_lib.real_valued_column(
155        'feature', dimension=4)
156
157    classifier = linear.LinearClassifier(
158        n_classes=3, feature_columns=[feature_column])
159
160    classifier.fit(input_fn=_input_fn, steps=100)
161    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
162    self.assertGreater(scores['accuracy'], 0.9)
163
164  def testMultiClass_NpMatrixData(self):
165    """Tests multi-class classification using numpy matrix data as input."""
166    iris = base.load_iris()
167    train_x = iris.data
168    train_y = iris.target
169    feature_column = feature_column_lib.real_valued_column('', dimension=4)
170    classifier = linear.LinearClassifier(
171        n_classes=3, feature_columns=[feature_column])
172
173    classifier.fit(x=train_x, y=train_y, steps=100)
174    scores = classifier.evaluate(x=train_x, y=train_y, steps=1)
175    self.assertGreater(scores['accuracy'], 0.9)
176
177  def testMultiClassLabelKeys(self):
178    """Tests n_classes > 2 with label_keys vocabulary for labels."""
179    # Byte literals needed for python3 test to pass.
180    label_keys = [b'label0', b'label1', b'label2']
181
182    def _input_fn(num_epochs=None):
183      features = {
184          'language':
185              sparse_tensor.SparseTensor(
186                  values=input_lib.limit_epochs(
187                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
188                  indices=[[0, 0], [0, 1], [2, 0]],
189                  dense_shape=[3, 2])
190      }
191      labels = constant_op.constant(
192          [[label_keys[1]], [label_keys[0]], [label_keys[0]]],
193          dtype=dtypes.string)
194      return features, labels
195
196    language_column = feature_column_lib.sparse_column_with_hash_bucket(
197        'language', hash_bucket_size=20)
198
199    classifier = linear.LinearClassifier(
200        n_classes=3,
201        feature_columns=[language_column],
202        label_keys=label_keys)
203
204    classifier.fit(input_fn=_input_fn, steps=50)
205
206    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
207    self.assertGreater(scores['accuracy'], 0.9)
208    self.assertIn('loss', scores)
209    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
210    predicted_classes = list(
211        classifier.predict_classes(
212            input_fn=predict_input_fn, as_iterable=True))
213    self.assertEqual(3, len(predicted_classes))
214    for pred in predicted_classes:
215      self.assertIn(pred, label_keys)
216    predictions = list(
217        classifier.predict(input_fn=predict_input_fn, as_iterable=True))
218    self.assertAllEqual(predicted_classes, predictions)
219
220  def testLogisticRegression_MatrixData(self):
221    """Tests binary classification using matrix data as input."""
222
223    def _input_fn():
224      iris = _prepare_iris_data_for_logistic_regression()
225      return {
226          'feature': constant_op.constant(
227              iris.data, dtype=dtypes.float32)
228      }, constant_op.constant(
229          iris.target, shape=[100, 1], dtype=dtypes.int32)
230
231    feature_column = feature_column_lib.real_valued_column(
232        'feature', dimension=4)
233
234    classifier = linear.LinearClassifier(feature_columns=[feature_column])
235
236    classifier.fit(input_fn=_input_fn, steps=100)
237    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
238    self.assertGreater(scores['accuracy'], 0.9)
239
240  def testEstimatorWithCoreFeatureColumns(self):
241
242    def _input_fn(num_epochs=None):
243      features = {
244          'age':
245              input_lib.limit_epochs(
246                  constant_op.constant([[.8], [0.2], [.1]]),
247                  num_epochs=num_epochs),
248          'language':
249              sparse_tensor.SparseTensor(
250                  values=input_lib.limit_epochs(
251                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
252                  indices=[[0, 0], [0, 1], [2, 0]],
253                  dense_shape=[3, 2])
254      }
255      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
256
257    language_column = fc_core.categorical_column_with_hash_bucket(
258        'language', hash_bucket_size=20)
259    feature_columns = [language_column, fc_core.numeric_column('age')]
260
261    classifier = linear.LinearClassifier(feature_columns=feature_columns)
262    classifier.fit(input_fn=_input_fn, steps=100)
263    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
264    self.assertGreater(scores['accuracy'], 0.9)
265
266  def testLogisticRegression_MatrixData_Labels1D(self):
267    """Same as the last test, but labels shape is [100] instead of [100, 1]."""
268
269    def _input_fn():
270      iris = _prepare_iris_data_for_logistic_regression()
271      return {
272          'feature': constant_op.constant(
273              iris.data, dtype=dtypes.float32)
274      }, constant_op.constant(
275          iris.target, shape=[100], dtype=dtypes.int32)
276
277    feature_column = feature_column_lib.real_valued_column(
278        'feature', dimension=4)
279
280    classifier = linear.LinearClassifier(feature_columns=[feature_column])
281
282    classifier.fit(input_fn=_input_fn, steps=100)
283    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
284    self.assertGreater(scores['accuracy'], 0.9)
285
286  def testLogisticRegression_NpMatrixData(self):
287    """Tests binary classification using numpy matrix data as input."""
288    iris = _prepare_iris_data_for_logistic_regression()
289    train_x = iris.data
290    train_y = iris.target
291    feature_columns = [feature_column_lib.real_valued_column('', dimension=4)]
292    classifier = linear.LinearClassifier(feature_columns=feature_columns)
293
294    classifier.fit(x=train_x, y=train_y, steps=100)
295    scores = classifier.evaluate(x=train_x, y=train_y, steps=1)
296    self.assertGreater(scores['accuracy'], 0.9)
297
298  def testWeightAndBiasNames(self):
299    """Tests that weight and bias names haven't changed."""
300    feature_column = feature_column_lib.real_valued_column(
301        'feature', dimension=4)
302
303    classifier = linear.LinearClassifier(
304        n_classes=3, feature_columns=[feature_column])
305
306    classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100)
307
308    variable_names = classifier.get_variable_names()
309    self.assertIn('linear/feature/weight', variable_names)
310    self.assertIn('linear/bias_weight', variable_names)
311    self.assertEqual(
312        4, len(classifier.get_variable_value('linear/feature/weight')))
313    self.assertEqual(
314        3, len(classifier.get_variable_value('linear/bias_weight')))
315
316  def testCustomOptimizerByObject(self):
317    """Tests multi-class classification using matrix data as input."""
318    feature_column = feature_column_lib.real_valued_column(
319        'feature', dimension=4)
320
321    classifier = linear.LinearClassifier(
322        n_classes=3,
323        optimizer=ftrl.FtrlOptimizer(learning_rate=0.1),
324        feature_columns=[feature_column])
325
326    classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100)
327    scores = classifier.evaluate(
328        input_fn=test_data.iris_input_multiclass_fn, steps=100)
329    self.assertGreater(scores['accuracy'], 0.9)
330
331  def testCustomOptimizerByString(self):
332    """Tests multi-class classification using matrix data as input."""
333    feature_column = feature_column_lib.real_valued_column(
334        'feature', dimension=4)
335
336    def _optimizer():
337      return ftrl.FtrlOptimizer(learning_rate=0.1)
338
339    classifier = linear.LinearClassifier(
340        n_classes=3, optimizer=_optimizer, feature_columns=[feature_column])
341
342    classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100)
343    scores = classifier.evaluate(
344        input_fn=test_data.iris_input_multiclass_fn, steps=100)
345    self.assertGreater(scores['accuracy'], 0.9)
346
347  def testCustomOptimizerByFunction(self):
348    """Tests multi-class classification using matrix data as input."""
349    feature_column = feature_column_lib.real_valued_column(
350        'feature', dimension=4)
351
352    classifier = linear.LinearClassifier(
353        n_classes=3, optimizer='Ftrl', feature_columns=[feature_column])
354
355    classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100)
356    scores = classifier.evaluate(
357        input_fn=test_data.iris_input_multiclass_fn, steps=100)
358    self.assertGreater(scores['accuracy'], 0.9)
359
360  def testCustomMetrics(self):
361    """Tests custom evaluation metrics."""
362
363    def _input_fn(num_epochs=None):
364      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
365      labels = constant_op.constant([[1], [0], [0], [0]], dtype=dtypes.float32)
366      features = {
367          'x':
368              input_lib.limit_epochs(
369                  array_ops.ones(
370                      shape=[4, 1], dtype=dtypes.float32),
371                  num_epochs=num_epochs)
372      }
373      return features, labels
374
375    def _my_metric_op(predictions, labels):
376      # For the case of binary classification, the 2nd column of "predictions"
377      # denotes the model predictions.
378      predictions = array_ops.strided_slice(
379          predictions, [0, 1], [-1, 2], end_mask=1)
380      return math_ops.reduce_sum(math_ops.multiply(predictions, labels))
381
382    classifier = linear.LinearClassifier(
383        feature_columns=[feature_column_lib.real_valued_column('x')])
384
385    classifier.fit(input_fn=_input_fn, steps=100)
386    scores = classifier.evaluate(
387        input_fn=_input_fn,
388        steps=100,
389        metrics={
390            'my_accuracy':
391                MetricSpec(
392                    metric_fn=metric_ops.streaming_accuracy,
393                    prediction_key='classes'),
394            'my_precision':
395                MetricSpec(
396                    metric_fn=metric_ops.streaming_precision,
397                    prediction_key='classes'),
398            'my_metric':
399                MetricSpec(
400                    metric_fn=_my_metric_op, prediction_key='probabilities')
401        })
402    self.assertTrue(
403        set(['loss', 'my_accuracy', 'my_precision', 'my_metric']).issubset(
404            set(scores.keys())))
405    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
406    predictions = np.array(list(classifier.predict_classes(
407        input_fn=predict_input_fn)))
408    self.assertEqual(
409        _sklearn.accuracy_score([1, 0, 0, 0], predictions),
410        scores['my_accuracy'])
411
412    # Tests the case where the prediction_key is neither "classes" nor
413    # "probabilities".
414    with self.assertRaisesRegexp(KeyError, 'bad_type'):
415      classifier.evaluate(
416          input_fn=_input_fn,
417          steps=100,
418          metrics={
419              'bad_name':
420                  MetricSpec(
421                      metric_fn=metric_ops.streaming_auc,
422                      prediction_key='bad_type')
423          })
424
425    # Tests the case where the 2nd element of the key is neither "classes" nor
426    # "probabilities".
427    with self.assertRaises(KeyError):
428      classifier.evaluate(
429          input_fn=_input_fn,
430          steps=100,
431          metrics={('bad_name', 'bad_type'): metric_ops.streaming_auc})
432
433    # Tests the case where the tuple of the key doesn't have 2 elements.
434    with self.assertRaises(ValueError):
435      classifier.evaluate(
436          input_fn=_input_fn,
437          steps=100,
438          metrics={
439              ('bad_length_name', 'classes', 'bad_length'):
440                  metric_ops.streaming_accuracy
441          })
442
443  def testLogisticFractionalLabels(self):
444    """Tests logistic training with fractional labels."""
445
446    def input_fn(num_epochs=None):
447      return {
448          'age':
449              input_lib.limit_epochs(
450                  constant_op.constant([[1], [2]]), num_epochs=num_epochs),
451      }, constant_op.constant(
452          [[.7], [0]], dtype=dtypes.float32)
453
454    age = feature_column_lib.real_valued_column('age')
455
456    classifier = linear.LinearClassifier(
457        feature_columns=[age], config=run_config.RunConfig(tf_random_seed=1))
458    classifier.fit(input_fn=input_fn, steps=500)
459
460    predict_input_fn = functools.partial(input_fn, num_epochs=1)
461    predictions_proba = list(
462        classifier.predict_proba(input_fn=predict_input_fn))
463    # Prediction probabilities mirror the labels column, which proves that the
464    # classifier learns from float input.
465    self.assertAllClose([[.3, .7], [1., 0.]], predictions_proba, atol=.1)
466
467  def testTrainWithPartitionedVariables(self):
468    """Tests training with partitioned variables."""
469
470    def _input_fn():
471      features = {
472          'language':
473              sparse_tensor.SparseTensor(
474                  values=['en', 'fr', 'zh'],
475                  indices=[[0, 0], [0, 1], [2, 0]],
476                  dense_shape=[3, 2])
477      }
478      labels = constant_op.constant([[1], [0], [0]])
479      return features, labels
480
481    sparse_features = [
482        # The given hash_bucket_size results in variables larger than the
483        # default min_slice_size attribute, so the variables are partitioned.
484        feature_column_lib.sparse_column_with_hash_bucket(
485            'language', hash_bucket_size=2e7)
486    ]
487
488    tf_config = {
489        'cluster': {
490            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
491        }
492    }
493    with test.mock.patch.dict('os.environ',
494                              {'TF_CONFIG': json.dumps(tf_config)}):
495      config = run_config.RunConfig()
496      # Because we did not start a distributed cluster, we need to pass an
497      # empty ClusterSpec, otherwise the device_setter will look for
498      # distributed jobs, such as "/job:ps" which are not present.
499      config._cluster_spec = server_lib.ClusterSpec({})
500
501    classifier = linear.LinearClassifier(
502        feature_columns=sparse_features, config=config)
503    classifier.fit(input_fn=_input_fn, steps=200)
504    loss = classifier.evaluate(input_fn=_input_fn, steps=1)['loss']
505    self.assertLess(loss, 0.07)
506
507  def testTrainSaveLoad(self):
508    """Tests that insures you can save and reload a trained model."""
509
510    def input_fn(num_epochs=None):
511      return {
512          'age':
513              input_lib.limit_epochs(
514                  constant_op.constant([1]), num_epochs=num_epochs),
515          'language':
516              sparse_tensor.SparseTensor(
517                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1]),
518      }, constant_op.constant([[1]])
519
520    language = feature_column_lib.sparse_column_with_hash_bucket('language',
521                                                                 100)
522    age = feature_column_lib.real_valued_column('age')
523
524    model_dir = tempfile.mkdtemp()
525    classifier = linear.LinearClassifier(
526        model_dir=model_dir, feature_columns=[age, language])
527    classifier.fit(input_fn=input_fn, steps=30)
528    predict_input_fn = functools.partial(input_fn, num_epochs=1)
529    out1_class = list(
530        classifier.predict_classes(
531            input_fn=predict_input_fn, as_iterable=True))
532    out1_proba = list(
533        classifier.predict_proba(
534            input_fn=predict_input_fn, as_iterable=True))
535    del classifier
536
537    classifier2 = linear.LinearClassifier(
538        model_dir=model_dir, feature_columns=[age, language])
539    out2_class = list(
540        classifier2.predict_classes(
541            input_fn=predict_input_fn, as_iterable=True))
542    out2_proba = list(
543        classifier2.predict_proba(
544            input_fn=predict_input_fn, as_iterable=True))
545    self.assertTrue(np.array_equal(out1_class, out2_class))
546    self.assertTrue(np.array_equal(out1_proba, out2_proba))
547
548  def testWeightColumn(self):
549    """Tests training with given weight column."""
550
551    def _input_fn_train():
552      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
553      # First row has more weight than others. Model should fit (y=x) better
554      # than (y=Not(x)) due to the relative higher weight of the first row.
555      labels = constant_op.constant([[1], [0], [0], [0]])
556      features = {
557          'x': array_ops.ones(
558              shape=[4, 1], dtype=dtypes.float32),
559          'w': constant_op.constant([[100.], [3.], [2.], [2.]])
560      }
561      return features, labels
562
563    def _input_fn_eval():
564      # Create 4 rows (y = x)
565      labels = constant_op.constant([[1], [1], [1], [1]])
566      features = {
567          'x': array_ops.ones(
568              shape=[4, 1], dtype=dtypes.float32),
569          'w': constant_op.constant([[1.], [1.], [1.], [1.]])
570      }
571      return features, labels
572
573    classifier = linear.LinearClassifier(
574        weight_column_name='w',
575        feature_columns=[feature_column_lib.real_valued_column('x')],
576        config=run_config.RunConfig(tf_random_seed=3))
577
578    classifier.fit(input_fn=_input_fn_train, steps=100)
579    scores = classifier.evaluate(input_fn=_input_fn_eval, steps=1)
580    # All examples in eval data set are y=x.
581    self.assertGreater(scores['labels/actual_label_mean'], 0.9)
582    # If there were no weight column, model would learn y=Not(x). Because of
583    # weights, it learns y=x.
584    self.assertGreater(scores['labels/prediction_mean'], 0.9)
585    # All examples in eval data set are y=x. So if weight column were ignored,
586    # then accuracy would be zero. Because of weights, accuracy should be close
587    # to 1.0.
588    self.assertGreater(scores['accuracy'], 0.9)
589
590    scores_train_set = classifier.evaluate(input_fn=_input_fn_train, steps=1)
591    # Considering weights, the mean label should be close to 1.0.
592    # If weights were ignored, it would be 0.25.
593    self.assertGreater(scores_train_set['labels/actual_label_mean'], 0.9)
594    # The classifier has learned y=x.  If weight column were ignored in
595    # evaluation, then accuracy for the train set would be 0.25.
596    # Because weight is not ignored, accuracy is greater than 0.6.
597    self.assertGreater(scores_train_set['accuracy'], 0.6)
598
599  def testWeightColumnLoss(self):
600    """Test ensures that you can specify per-example weights for loss."""
601
602    def _input_fn():
603      features = {
604          'age': constant_op.constant([[20], [20], [20]]),
605          'weights': constant_op.constant([[100], [1], [1]]),
606      }
607      labels = constant_op.constant([[1], [0], [0]])
608      return features, labels
609
610    age = feature_column_lib.real_valued_column('age')
611
612    classifier = linear.LinearClassifier(feature_columns=[age])
613    classifier.fit(input_fn=_input_fn, steps=100)
614    loss_unweighted = classifier.evaluate(input_fn=_input_fn, steps=1)['loss']
615
616    classifier = linear.LinearClassifier(
617        feature_columns=[age], weight_column_name='weights')
618    classifier.fit(input_fn=_input_fn, steps=100)
619    loss_weighted = classifier.evaluate(input_fn=_input_fn, steps=1)['loss']
620
621    self.assertLess(loss_weighted, loss_unweighted)
622
623  def testExport(self):
624    """Tests that export model for servo works."""
625
626    def input_fn():
627      return {
628          'age':
629              constant_op.constant([1]),
630          'language':
631              sparse_tensor.SparseTensor(
632                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
633      }, constant_op.constant([[1]])
634
635    language = feature_column_lib.sparse_column_with_hash_bucket('language',
636                                                                 100)
637    age = feature_column_lib.real_valued_column('age')
638
639    classifier = linear.LinearClassifier(feature_columns=[age, language])
640    classifier.fit(input_fn=input_fn, steps=100)
641
642    export_dir = tempfile.mkdtemp()
643    classifier.export(export_dir)
644
645  def testDisableCenteredBias(self):
646    """Tests that we can disable centered bias."""
647
648    def input_fn():
649      return {
650          'age':
651              constant_op.constant([1]),
652          'language':
653              sparse_tensor.SparseTensor(
654                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
655      }, constant_op.constant([[1]])
656
657    language = feature_column_lib.sparse_column_with_hash_bucket('language',
658                                                                 100)
659    age = feature_column_lib.real_valued_column('age')
660
661    classifier = linear.LinearClassifier(
662        feature_columns=[age, language], enable_centered_bias=False)
663    classifier.fit(input_fn=input_fn, steps=100)
664    self.assertNotIn('centered_bias_weight', classifier.get_variable_names())
665
666  def testEnableCenteredBias(self):
667    """Tests that we can enable centered bias."""
668
669    def input_fn():
670      return {
671          'age':
672              constant_op.constant([1]),
673          'language':
674              sparse_tensor.SparseTensor(
675                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
676      }, constant_op.constant([[1]])
677
678    language = feature_column_lib.sparse_column_with_hash_bucket('language',
679                                                                 100)
680    age = feature_column_lib.real_valued_column('age')
681
682    classifier = linear.LinearClassifier(
683        feature_columns=[age, language], enable_centered_bias=True)
684    classifier.fit(input_fn=input_fn, steps=100)
685    self.assertIn('linear/binary_logistic_head/centered_bias_weight',
686                  classifier.get_variable_names())
687
688  def testTrainOptimizerWithL1Reg(self):
689    """Tests l1 regularized model has higher loss."""
690
691    def input_fn():
692      return {
693          'language':
694              sparse_tensor.SparseTensor(
695                  values=['hindi'], indices=[[0, 0]], dense_shape=[1, 1])
696      }, constant_op.constant([[1]])
697
698    language = feature_column_lib.sparse_column_with_hash_bucket('language',
699                                                                 100)
700    classifier_no_reg = linear.LinearClassifier(feature_columns=[language])
701    classifier_with_reg = linear.LinearClassifier(
702        feature_columns=[language],
703        optimizer=ftrl.FtrlOptimizer(
704            learning_rate=1.0, l1_regularization_strength=100.))
705    loss_no_reg = classifier_no_reg.fit(input_fn=input_fn, steps=100).evaluate(
706        input_fn=input_fn, steps=1)['loss']
707    loss_with_reg = classifier_with_reg.fit(input_fn=input_fn,
708                                            steps=100).evaluate(
709                                                input_fn=input_fn,
710                                                steps=1)['loss']
711    self.assertLess(loss_no_reg, loss_with_reg)
712
713  def testTrainWithMissingFeature(self):
714    """Tests that training works with missing features."""
715
716    def input_fn():
717      return {
718          'language':
719              sparse_tensor.SparseTensor(
720                  values=['Swahili', 'turkish'],
721                  indices=[[0, 0], [2, 0]],
722                  dense_shape=[3, 1])
723      }, constant_op.constant([[1], [1], [1]])
724
725    language = feature_column_lib.sparse_column_with_hash_bucket('language',
726                                                                 100)
727    classifier = linear.LinearClassifier(feature_columns=[language])
728    classifier.fit(input_fn=input_fn, steps=100)
729    loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
730    self.assertLess(loss, 0.07)
731
732  def testSdcaOptimizerRealValuedFeatures(self):
733    """Tests LinearClassifier with SDCAOptimizer and real valued features."""
734
735    def input_fn():
736      return {
737          'example_id': constant_op.constant(['1', '2']),
738          'maintenance_cost': constant_op.constant([[500.0], [200.0]]),
739          'sq_footage': constant_op.constant([[800.0], [600.0]]),
740          'weights': constant_op.constant([[1.0], [1.0]])
741      }, constant_op.constant([[0], [1]])
742
743    maintenance_cost = feature_column_lib.real_valued_column('maintenance_cost')
744    sq_footage = feature_column_lib.real_valued_column('sq_footage')
745    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
746        example_id_column='example_id')
747    classifier = linear.LinearClassifier(
748        feature_columns=[maintenance_cost, sq_footage],
749        weight_column_name='weights',
750        optimizer=sdca_optimizer)
751    classifier.fit(input_fn=input_fn, steps=100)
752    loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
753    self.assertLess(loss, 0.05)
754
755  def testSdcaOptimizerRealValuedFeatureWithHigherDimension(self):
756    """Tests SDCAOptimizer with real valued features of higher dimension."""
757
758    # input_fn is identical to the one in testSdcaOptimizerRealValuedFeatures
759    # where 2 1-dimensional dense features have been replaced by 1 2-dimensional
760    # feature.
761    def input_fn():
762      return {
763          'example_id':
764              constant_op.constant(['1', '2']),
765          'dense_feature':
766              constant_op.constant([[500.0, 800.0], [200.0, 600.0]])
767      }, constant_op.constant([[0], [1]])
768
769    dense_feature = feature_column_lib.real_valued_column(
770        'dense_feature', dimension=2)
771    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
772        example_id_column='example_id')
773    classifier = linear.LinearClassifier(
774        feature_columns=[dense_feature], optimizer=sdca_optimizer)
775    classifier.fit(input_fn=input_fn, steps=100)
776    loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
777    self.assertLess(loss, 0.05)
778
779  def testSdcaOptimizerBucketizedFeatures(self):
780    """Tests LinearClassifier with SDCAOptimizer and bucketized features."""
781
782    def input_fn():
783      return {
784          'example_id': constant_op.constant(['1', '2', '3']),
785          'price': constant_op.constant([[600.0], [1000.0], [400.0]]),
786          'sq_footage': constant_op.constant([[1000.0], [600.0], [700.0]]),
787          'weights': constant_op.constant([[1.0], [1.0], [1.0]])
788      }, constant_op.constant([[1], [0], [1]])
789
790    price_bucket = feature_column_lib.bucketized_column(
791        feature_column_lib.real_valued_column('price'),
792        boundaries=[500.0, 700.0])
793    sq_footage_bucket = feature_column_lib.bucketized_column(
794        feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0])
795    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
796        example_id_column='example_id', symmetric_l2_regularization=1.0)
797    classifier = linear.LinearClassifier(
798        feature_columns=[price_bucket, sq_footage_bucket],
799        weight_column_name='weights',
800        optimizer=sdca_optimizer)
801    classifier.fit(input_fn=input_fn, steps=50)
802    scores = classifier.evaluate(input_fn=input_fn, steps=1)
803    self.assertGreater(scores['accuracy'], 0.9)
804
805  def testSdcaOptimizerSparseFeatures(self):
806    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
807
808    def input_fn():
809      return {
810          'example_id':
811              constant_op.constant(['1', '2', '3']),
812          'price':
813              constant_op.constant([0.4, 0.6, 0.3]),
814          'country':
815              sparse_tensor.SparseTensor(
816                  values=['IT', 'US', 'GB'],
817                  indices=[[0, 0], [1, 3], [2, 1]],
818                  dense_shape=[3, 5]),
819          'weights':
820              constant_op.constant([[1.0], [1.0], [1.0]])
821      }, constant_op.constant([[1], [0], [1]])
822
823    price = feature_column_lib.real_valued_column('price')
824    country = feature_column_lib.sparse_column_with_hash_bucket(
825        'country', hash_bucket_size=5)
826    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
827        example_id_column='example_id')
828    classifier = linear.LinearClassifier(
829        feature_columns=[price, country],
830        weight_column_name='weights',
831        optimizer=sdca_optimizer)
832    classifier.fit(input_fn=input_fn, steps=50)
833    scores = classifier.evaluate(input_fn=input_fn, steps=1)
834    self.assertGreater(scores['accuracy'], 0.9)
835
836  def testSdcaOptimizerWeightedSparseFeatures(self):
837    """LinearClassifier with SDCAOptimizer and weighted sparse features."""
838
839    def input_fn():
840      return {
841          'example_id':
842              constant_op.constant(['1', '2', '3']),
843          'price':
844              sparse_tensor.SparseTensor(
845                  values=[2., 3., 1.],
846                  indices=[[0, 0], [1, 0], [2, 0]],
847                  dense_shape=[3, 5]),
848          'country':
849              sparse_tensor.SparseTensor(
850                  values=['IT', 'US', 'GB'],
851                  indices=[[0, 0], [1, 0], [2, 0]],
852                  dense_shape=[3, 5])
853      }, constant_op.constant([[1], [0], [1]])
854
855    country = feature_column_lib.sparse_column_with_hash_bucket(
856        'country', hash_bucket_size=5)
857    country_weighted_by_price = feature_column_lib.weighted_sparse_column(
858        country, 'price')
859    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
860        example_id_column='example_id')
861    classifier = linear.LinearClassifier(
862        feature_columns=[country_weighted_by_price], optimizer=sdca_optimizer)
863    classifier.fit(input_fn=input_fn, steps=50)
864    scores = classifier.evaluate(input_fn=input_fn, steps=1)
865    self.assertGreater(scores['accuracy'], 0.9)
866
867  def testSdcaOptimizerWeightedSparseFeaturesOOVWithNoOOVBuckets(self):
868    """LinearClassifier with SDCAOptimizer with OOV features (-1 IDs)."""
869
870    def input_fn():
871      return {
872          'example_id':
873              constant_op.constant(['1', '2', '3']),
874          'price':
875              sparse_tensor.SparseTensor(
876                  values=[2., 3., 1.],
877                  indices=[[0, 0], [1, 0], [2, 0]],
878                  dense_shape=[3, 5]),
879          'country':
880              sparse_tensor.SparseTensor(
881                  # 'GB' is out of the vocabulary.
882                  values=['IT', 'US', 'GB'],
883                  indices=[[0, 0], [1, 0], [2, 0]],
884                  dense_shape=[3, 5])
885      }, constant_op.constant([[1], [0], [1]])
886
887    country = feature_column_lib.sparse_column_with_keys(
888        'country', keys=['US', 'CA', 'MK', 'IT', 'CN'])
889    country_weighted_by_price = feature_column_lib.weighted_sparse_column(
890        country, 'price')
891    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
892        example_id_column='example_id')
893    classifier = linear.LinearClassifier(
894        feature_columns=[country_weighted_by_price], optimizer=sdca_optimizer)
895    classifier.fit(input_fn=input_fn, steps=50)
896    scores = classifier.evaluate(input_fn=input_fn, steps=1)
897    self.assertGreater(scores['accuracy'], 0.9)
898
899  def testSdcaOptimizerCrossedFeatures(self):
900    """Tests LinearClassifier with SDCAOptimizer and crossed features."""
901
902    def input_fn():
903      return {
904          'example_id':
905              constant_op.constant(['1', '2', '3']),
906          'language':
907              sparse_tensor.SparseTensor(
908                  values=['english', 'italian', 'spanish'],
909                  indices=[[0, 0], [1, 0], [2, 0]],
910                  dense_shape=[3, 1]),
911          'country':
912              sparse_tensor.SparseTensor(
913                  values=['US', 'IT', 'MX'],
914                  indices=[[0, 0], [1, 0], [2, 0]],
915                  dense_shape=[3, 1])
916      }, constant_op.constant([[0], [0], [1]])
917
918    language = feature_column_lib.sparse_column_with_hash_bucket(
919        'language', hash_bucket_size=5)
920    country = feature_column_lib.sparse_column_with_hash_bucket(
921        'country', hash_bucket_size=5)
922    country_language = feature_column_lib.crossed_column(
923        [language, country], hash_bucket_size=10)
924    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
925        example_id_column='example_id')
926    classifier = linear.LinearClassifier(
927        feature_columns=[country_language], optimizer=sdca_optimizer)
928    classifier.fit(input_fn=input_fn, steps=10)
929    scores = classifier.evaluate(input_fn=input_fn, steps=1)
930    self.assertGreater(scores['accuracy'], 0.9)
931
932  def testSdcaOptimizerMixedFeatures(self):
933    """Tests LinearClassifier with SDCAOptimizer and a mix of features."""
934
935    def input_fn():
936      return {
937          'example_id':
938              constant_op.constant(['1', '2', '3']),
939          'price':
940              constant_op.constant([[0.6], [0.8], [0.3]]),
941          'sq_footage':
942              constant_op.constant([[900.0], [700.0], [600.0]]),
943          'country':
944              sparse_tensor.SparseTensor(
945                  values=['IT', 'US', 'GB'],
946                  indices=[[0, 0], [1, 3], [2, 1]],
947                  dense_shape=[3, 5]),
948          'weights':
949              constant_op.constant([[3.0], [1.0], [1.0]])
950      }, constant_op.constant([[1], [0], [1]])
951
952    price = feature_column_lib.real_valued_column('price')
953    sq_footage_bucket = feature_column_lib.bucketized_column(
954        feature_column_lib.real_valued_column('sq_footage'),
955        boundaries=[650.0, 800.0])
956    country = feature_column_lib.sparse_column_with_hash_bucket(
957        'country', hash_bucket_size=5)
958    sq_footage_country = feature_column_lib.crossed_column(
959        [sq_footage_bucket, country], hash_bucket_size=10)
960    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
961        example_id_column='example_id')
962    classifier = linear.LinearClassifier(
963        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
964        weight_column_name='weights',
965        optimizer=sdca_optimizer)
966    classifier.fit(input_fn=input_fn, steps=50)
967    scores = classifier.evaluate(input_fn=input_fn, steps=1)
968    self.assertGreater(scores['accuracy'], 0.9)
969
970  def testSdcaOptimizerPartitionedVariables(self):
971    """Tests LinearClassifier with SDCAOptimizer with partitioned variables."""
972
973    def input_fn():
974      return {
975          'example_id':
976              constant_op.constant(['1', '2', '3']),
977          'price':
978              constant_op.constant([[0.6], [0.8], [0.3]]),
979          'sq_footage':
980              constant_op.constant([[900.0], [700.0], [600.0]]),
981          'country':
982              sparse_tensor.SparseTensor(
983                  values=['IT', 'US', 'GB'],
984                  indices=[[0, 0], [1, 3], [2, 1]],
985                  dense_shape=[3, 5]),
986          'weights':
987              constant_op.constant([[3.0], [1.0], [1.0]])
988      }, constant_op.constant([[1], [0], [1]])
989
990    price = feature_column_lib.real_valued_column('price')
991    sq_footage_bucket = feature_column_lib.bucketized_column(
992        feature_column_lib.real_valued_column('sq_footage'),
993        boundaries=[650.0, 800.0])
994    country = feature_column_lib.sparse_column_with_hash_bucket(
995        'country', hash_bucket_size=5)
996    sq_footage_country = feature_column_lib.crossed_column(
997        [sq_footage_bucket, country], hash_bucket_size=10)
998
999    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1000        example_id_column='example_id',
1001        partitioner=partitioned_variables.fixed_size_partitioner(
1002            num_shards=2, axis=0))
1003
1004    tf_config = {
1005        'cluster': {
1006            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
1007        }
1008    }
1009    with test.mock.patch.dict('os.environ',
1010                              {'TF_CONFIG': json.dumps(tf_config)}):
1011      config = run_config.RunConfig()
1012      # Because we did not start a distributed cluster, we need to pass an
1013      # empty ClusterSpec, otherwise the device_setter will look for
1014      # distributed jobs, such as "/job:ps" which are not present.
1015      config._cluster_spec = server_lib.ClusterSpec({})
1016
1017    classifier = linear.LinearClassifier(
1018        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
1019        weight_column_name='weights',
1020        optimizer=sdca_optimizer,
1021        config=config)
1022    classifier.fit(input_fn=input_fn, steps=50)
1023    scores = classifier.evaluate(input_fn=input_fn, steps=1)
1024    print('all scores = {}'.format(scores))
1025    self.assertGreater(scores['accuracy'], 0.9)
1026
1027  def testEval(self):
1028    """Tests that eval produces correct metrics.
1029    """
1030
1031    def input_fn():
1032      return {
1033          'age':
1034              constant_op.constant([[1], [2]]),
1035          'language':
1036              sparse_tensor.SparseTensor(
1037                  values=['greek', 'chinese'],
1038                  indices=[[0, 0], [1, 0]],
1039                  dense_shape=[2, 1]),
1040      }, constant_op.constant([[1], [0]])
1041
1042    language = feature_column_lib.sparse_column_with_hash_bucket('language',
1043                                                                 100)
1044    age = feature_column_lib.real_valued_column('age')
1045    classifier = linear.LinearClassifier(feature_columns=[age, language])
1046
1047    # Evaluate on trained model
1048    classifier.fit(input_fn=input_fn, steps=100)
1049    classifier.evaluate(input_fn=input_fn, steps=1)
1050
1051    # TODO(ispir): Enable accuracy check after resolving the randomness issue.
1052    # self.assertLess(evaluated_values['loss/mean'], 0.3)
1053    # self.assertGreater(evaluated_values['accuracy/mean'], .95)
1054
1055
1056class LinearRegressorTest(test.TestCase):
1057
1058  def testExperimentIntegration(self):
1059    cont_features = [
1060        feature_column_lib.real_valued_column(
1061            'feature', dimension=4)
1062    ]
1063
1064    exp = experiment.Experiment(
1065        estimator=linear.LinearRegressor(feature_columns=cont_features),
1066        train_input_fn=test_data.iris_input_logistic_fn,
1067        eval_input_fn=test_data.iris_input_logistic_fn)
1068    exp.test()
1069
1070  def testEstimatorContract(self):
1071    estimator_test_utils.assert_estimator_contract(self, linear.LinearRegressor)
1072
1073  def testRegression(self):
1074    """Tests that loss goes down with training."""
1075
1076    def input_fn():
1077      return {
1078          'age':
1079              constant_op.constant([1]),
1080          'language':
1081              sparse_tensor.SparseTensor(
1082                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
1083      }, constant_op.constant([[10.]])
1084
1085    language = feature_column_lib.sparse_column_with_hash_bucket('language',
1086                                                                 100)
1087    age = feature_column_lib.real_valued_column('age')
1088
1089    classifier = linear.LinearRegressor(feature_columns=[age, language])
1090    classifier.fit(input_fn=input_fn, steps=100)
1091    loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
1092    classifier.fit(input_fn=input_fn, steps=200)
1093    loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
1094
1095    self.assertLess(loss2, loss1)
1096    self.assertLess(loss2, 0.5)
1097
1098  def testRegression_MatrixData(self):
1099    """Tests regression using matrix data as input."""
1100    cont_features = [
1101        feature_column_lib.real_valued_column(
1102            'feature', dimension=4)
1103    ]
1104
1105    regressor = linear.LinearRegressor(
1106        feature_columns=cont_features,
1107        config=run_config.RunConfig(tf_random_seed=1))
1108
1109    regressor.fit(input_fn=test_data.iris_input_multiclass_fn, steps=100)
1110    scores = regressor.evaluate(
1111        input_fn=test_data.iris_input_multiclass_fn, steps=1)
1112    self.assertLess(scores['loss'], 0.2)
1113
1114  def testRegression_TensorData(self):
1115    """Tests regression using tensor data as input."""
1116
1117    def _input_fn(num_epochs=None):
1118      features = {
1119          'age':
1120              input_lib.limit_epochs(
1121                  constant_op.constant([[0.8], [0.15], [0.]]),
1122                  num_epochs=num_epochs),
1123          'language':
1124              sparse_tensor.SparseTensor(
1125                  values=['en', 'fr', 'zh'],
1126                  indices=[[0, 0], [0, 1], [2, 0]],
1127                  dense_shape=[3, 2])
1128      }
1129      return features, constant_op.constant(
1130          [1.0, 0., 0.2], dtype=dtypes.float32)
1131
1132    feature_columns = [
1133        feature_column_lib.sparse_column_with_hash_bucket(
1134            'language', hash_bucket_size=20),
1135        feature_column_lib.real_valued_column('age')
1136    ]
1137
1138    regressor = linear.LinearRegressor(
1139        feature_columns=feature_columns,
1140        config=run_config.RunConfig(tf_random_seed=1))
1141
1142    regressor.fit(input_fn=_input_fn, steps=100)
1143
1144    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
1145    self.assertLess(scores['loss'], 0.2)
1146
1147  def testLoss(self):
1148    """Tests loss calculation."""
1149
1150    def _input_fn_train():
1151      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
1152      # The algorithm should learn (y = 0.25).
1153      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
1154      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
1155      return features, labels
1156
1157    regressor = linear.LinearRegressor(
1158        feature_columns=[feature_column_lib.real_valued_column('x')],
1159        config=run_config.RunConfig(tf_random_seed=1))
1160
1161    regressor.fit(input_fn=_input_fn_train, steps=100)
1162    scores = regressor.evaluate(input_fn=_input_fn_train, steps=1)
1163    # Average square loss = (0.75^2 + 3*0.25^2) / 4 = 0.1875
1164    self.assertAlmostEqual(0.1875, scores['loss'], delta=0.1)
1165
1166  def testLossWithWeights(self):
1167    """Tests loss calculation with weights."""
1168
1169    def _input_fn_train():
1170      # 4 rows with equal weight, one of them (y = x), three of them (y=Not(x))
1171      # The algorithm should learn (y = 0.25).
1172      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
1173      features = {
1174          'x': array_ops.ones(
1175              shape=[4, 1], dtype=dtypes.float32),
1176          'w': constant_op.constant([[1.], [1.], [1.], [1.]])
1177      }
1178      return features, labels
1179
1180    def _input_fn_eval():
1181      # 4 rows, with different weights.
1182      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
1183      features = {
1184          'x': array_ops.ones(
1185              shape=[4, 1], dtype=dtypes.float32),
1186          'w': constant_op.constant([[7.], [1.], [1.], [1.]])
1187      }
1188      return features, labels
1189
1190    regressor = linear.LinearRegressor(
1191        weight_column_name='w',
1192        feature_columns=[feature_column_lib.real_valued_column('x')],
1193        config=run_config.RunConfig(tf_random_seed=1))
1194
1195    regressor.fit(input_fn=_input_fn_train, steps=100)
1196    scores = regressor.evaluate(input_fn=_input_fn_eval, steps=1)
1197    # Weighted average square loss = (7*0.75^2 + 3*0.25^2) / 10 = 0.4125
1198    self.assertAlmostEqual(0.4125, scores['loss'], delta=0.1)
1199
1200  def testTrainWithWeights(self):
1201    """Tests training with given weight column."""
1202
1203    def _input_fn_train():
1204      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
1205      # First row has more weight than others. Model should fit (y=x) better
1206      # than (y=Not(x)) due to the relative higher weight of the first row.
1207      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
1208      features = {
1209          'x': array_ops.ones(
1210              shape=[4, 1], dtype=dtypes.float32),
1211          'w': constant_op.constant([[100.], [3.], [2.], [2.]])
1212      }
1213      return features, labels
1214
1215    def _input_fn_eval():
1216      # Create 4 rows (y = x)
1217      labels = constant_op.constant([[1.], [1.], [1.], [1.]])
1218      features = {
1219          'x': array_ops.ones(
1220              shape=[4, 1], dtype=dtypes.float32),
1221          'w': constant_op.constant([[1.], [1.], [1.], [1.]])
1222      }
1223      return features, labels
1224
1225    regressor = linear.LinearRegressor(
1226        weight_column_name='w',
1227        feature_columns=[feature_column_lib.real_valued_column('x')],
1228        config=run_config.RunConfig(tf_random_seed=1))
1229
1230    regressor.fit(input_fn=_input_fn_train, steps=100)
1231    scores = regressor.evaluate(input_fn=_input_fn_eval, steps=1)
1232    # The model should learn (y = x) because of the weights, so the loss should
1233    # be close to zero.
1234    self.assertLess(scores['loss'], 0.1)
1235
1236  def testPredict_AsIterableFalse(self):
1237    """Tests predict method with as_iterable=False."""
1238    labels = [1.0, 0., 0.2]
1239
1240    def _input_fn(num_epochs=None):
1241      features = {
1242          'age':
1243              input_lib.limit_epochs(
1244                  constant_op.constant([[0.8], [0.15], [0.]]),
1245                  num_epochs=num_epochs),
1246          'language':
1247              sparse_tensor.SparseTensor(
1248                  values=['en', 'fr', 'zh'],
1249                  indices=[[0, 0], [0, 1], [2, 0]],
1250                  dense_shape=[3, 2])
1251      }
1252      return features, constant_op.constant(labels, dtype=dtypes.float32)
1253
1254    feature_columns = [
1255        feature_column_lib.sparse_column_with_hash_bucket(
1256            'language', hash_bucket_size=20),
1257        feature_column_lib.real_valued_column('age')
1258    ]
1259
1260    regressor = linear.LinearRegressor(
1261        feature_columns=feature_columns,
1262        config=run_config.RunConfig(tf_random_seed=1))
1263
1264    regressor.fit(input_fn=_input_fn, steps=100)
1265
1266    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
1267    self.assertLess(scores['loss'], 0.1)
1268    predicted_scores = regressor.predict_scores(
1269        input_fn=_input_fn, as_iterable=False)
1270    self.assertAllClose(labels, predicted_scores, atol=0.1)
1271    predictions = regressor.predict(input_fn=_input_fn, as_iterable=False)
1272    self.assertAllClose(predicted_scores, predictions)
1273
1274  def testPredict_AsIterable(self):
1275    """Tests predict method with as_iterable=True."""
1276    labels = [1.0, 0., 0.2]
1277
1278    def _input_fn(num_epochs=None):
1279      features = {
1280          'age':
1281              input_lib.limit_epochs(
1282                  constant_op.constant([[0.8], [0.15], [0.]]),
1283                  num_epochs=num_epochs),
1284          'language':
1285              sparse_tensor.SparseTensor(
1286                  values=['en', 'fr', 'zh'],
1287                  indices=[[0, 0], [0, 1], [2, 0]],
1288                  dense_shape=[3, 2])
1289      }
1290      return features, constant_op.constant(labels, dtype=dtypes.float32)
1291
1292    feature_columns = [
1293        feature_column_lib.sparse_column_with_hash_bucket(
1294            'language', hash_bucket_size=20),
1295        feature_column_lib.real_valued_column('age')
1296    ]
1297
1298    regressor = linear.LinearRegressor(
1299        feature_columns=feature_columns,
1300        config=run_config.RunConfig(tf_random_seed=1))
1301
1302    regressor.fit(input_fn=_input_fn, steps=100)
1303
1304    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
1305    self.assertLess(scores['loss'], 0.1)
1306    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
1307    predicted_scores = list(
1308        regressor.predict_scores(
1309            input_fn=predict_input_fn, as_iterable=True))
1310    self.assertAllClose(labels, predicted_scores, atol=0.1)
1311    predictions = list(
1312        regressor.predict(
1313            input_fn=predict_input_fn, as_iterable=True))
1314    self.assertAllClose(predicted_scores, predictions)
1315
1316  def testCustomMetrics(self):
1317    """Tests custom evaluation metrics."""
1318
1319    def _input_fn(num_epochs=None):
1320      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
1321      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
1322      features = {
1323          'x':
1324              input_lib.limit_epochs(
1325                  array_ops.ones(
1326                      shape=[4, 1], dtype=dtypes.float32),
1327                  num_epochs=num_epochs)
1328      }
1329      return features, labels
1330
1331    def _my_metric_op(predictions, labels):
1332      return math_ops.reduce_sum(math_ops.multiply(predictions, labels))
1333
1334    regressor = linear.LinearRegressor(
1335        feature_columns=[feature_column_lib.real_valued_column('x')],
1336        config=run_config.RunConfig(tf_random_seed=1))
1337
1338    regressor.fit(input_fn=_input_fn, steps=100)
1339    scores = regressor.evaluate(
1340        input_fn=_input_fn,
1341        steps=1,
1342        metrics={
1343            'my_error':
1344                MetricSpec(
1345                    metric_fn=metric_ops.streaming_mean_squared_error,
1346                    prediction_key='scores'),
1347            'my_metric':
1348                MetricSpec(
1349                    metric_fn=_my_metric_op, prediction_key='scores')
1350        })
1351    self.assertIn('loss', set(scores.keys()))
1352    self.assertIn('my_error', set(scores.keys()))
1353    self.assertIn('my_metric', set(scores.keys()))
1354    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
1355    predictions = np.array(list(
1356        regressor.predict_scores(input_fn=predict_input_fn)))
1357    self.assertAlmostEqual(
1358        _sklearn.mean_squared_error(np.array([1, 0, 0, 0]), predictions),
1359        scores['my_error'])
1360
1361    # Tests the case where the prediction_key is not "scores".
1362    with self.assertRaisesRegexp(KeyError, 'bad_type'):
1363      regressor.evaluate(
1364          input_fn=_input_fn,
1365          steps=1,
1366          metrics={
1367              'bad_name':
1368                  MetricSpec(
1369                      metric_fn=metric_ops.streaming_auc,
1370                      prediction_key='bad_type')
1371          })
1372
1373    # Tests the case where the 2nd element of the key is not "scores".
1374    with self.assertRaises(KeyError):
1375      regressor.evaluate(
1376          input_fn=_input_fn,
1377          steps=1,
1378          metrics={
1379              ('my_error', 'predictions'):
1380                  metric_ops.streaming_mean_squared_error
1381          })
1382
1383    # Tests the case where the tuple of the key doesn't have 2 elements.
1384    with self.assertRaises(ValueError):
1385      regressor.evaluate(
1386          input_fn=_input_fn,
1387          steps=1,
1388          metrics={
1389              ('bad_length_name', 'scores', 'bad_length'):
1390                  metric_ops.streaming_mean_squared_error
1391          })
1392
1393  def testTrainSaveLoad(self):
1394    """Tests that insures you can save and reload a trained model."""
1395
1396    def _input_fn(num_epochs=None):
1397      features = {
1398          'age':
1399              input_lib.limit_epochs(
1400                  constant_op.constant([[0.8], [0.15], [0.]]),
1401                  num_epochs=num_epochs),
1402          'language':
1403              sparse_tensor.SparseTensor(
1404                  values=['en', 'fr', 'zh'],
1405                  indices=[[0, 0], [0, 1], [2, 0]],
1406                  dense_shape=[3, 2])
1407      }
1408      return features, constant_op.constant(
1409          [1.0, 0., 0.2], dtype=dtypes.float32)
1410
1411    feature_columns = [
1412        feature_column_lib.sparse_column_with_hash_bucket(
1413            'language', hash_bucket_size=20),
1414        feature_column_lib.real_valued_column('age')
1415    ]
1416
1417    model_dir = tempfile.mkdtemp()
1418    regressor = linear.LinearRegressor(
1419        model_dir=model_dir,
1420        feature_columns=feature_columns,
1421        config=run_config.RunConfig(tf_random_seed=1))
1422
1423    regressor.fit(input_fn=_input_fn, steps=100)
1424    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
1425    predictions = list(regressor.predict_scores(input_fn=predict_input_fn))
1426    del regressor
1427
1428    regressor2 = linear.LinearRegressor(
1429        model_dir=model_dir, feature_columns=feature_columns)
1430    predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn))
1431    self.assertAllClose(predictions, predictions2)
1432
1433  def testTrainWithPartitionedVariables(self):
1434    """Tests training with partitioned variables."""
1435
1436    def _input_fn(num_epochs=None):
1437      features = {
1438          'age':
1439              input_lib.limit_epochs(
1440                  constant_op.constant([[0.8], [0.15], [0.]]),
1441                  num_epochs=num_epochs),
1442          'language':
1443              sparse_tensor.SparseTensor(
1444                  values=['en', 'fr', 'zh'],
1445                  indices=[[0, 0], [0, 1], [2, 0]],
1446                  dense_shape=[3, 2])
1447      }
1448      return features, constant_op.constant(
1449          [1.0, 0., 0.2], dtype=dtypes.float32)
1450
1451    feature_columns = [
1452        # The given hash_bucket_size results in variables larger than the
1453        # default min_slice_size attribute, so the variables are partitioned.
1454        feature_column_lib.sparse_column_with_hash_bucket(
1455            'language', hash_bucket_size=2e7),
1456        feature_column_lib.real_valued_column('age')
1457    ]
1458
1459    tf_config = {
1460        'cluster': {
1461            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
1462        }
1463    }
1464    with test.mock.patch.dict('os.environ',
1465                              {'TF_CONFIG': json.dumps(tf_config)}):
1466      config = run_config.RunConfig(tf_random_seed=1)
1467      # Because we did not start a distributed cluster, we need to pass an
1468      # empty ClusterSpec, otherwise the device_setter will look for
1469      # distributed jobs, such as "/job:ps" which are not present.
1470      config._cluster_spec = server_lib.ClusterSpec({})
1471
1472    regressor = linear.LinearRegressor(
1473        feature_columns=feature_columns, config=config)
1474
1475    regressor.fit(input_fn=_input_fn, steps=100)
1476
1477    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
1478    self.assertLess(scores['loss'], 0.1)
1479
1480  def testDisableCenteredBias(self):
1481    """Tests that we can disable centered bias."""
1482
1483    def _input_fn(num_epochs=None):
1484      features = {
1485          'age':
1486              input_lib.limit_epochs(
1487                  constant_op.constant([[0.8], [0.15], [0.]]),
1488                  num_epochs=num_epochs),
1489          'language':
1490              sparse_tensor.SparseTensor(
1491                  values=['en', 'fr', 'zh'],
1492                  indices=[[0, 0], [0, 1], [2, 0]],
1493                  dense_shape=[3, 2])
1494      }
1495      return features, constant_op.constant(
1496          [1.0, 0., 0.2], dtype=dtypes.float32)
1497
1498    feature_columns = [
1499        feature_column_lib.sparse_column_with_hash_bucket(
1500            'language', hash_bucket_size=20),
1501        feature_column_lib.real_valued_column('age')
1502    ]
1503
1504    regressor = linear.LinearRegressor(
1505        feature_columns=feature_columns,
1506        enable_centered_bias=False,
1507        config=run_config.RunConfig(tf_random_seed=1))
1508
1509    regressor.fit(input_fn=_input_fn, steps=100)
1510
1511    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
1512    self.assertLess(scores['loss'], 0.1)
1513
1514  def testRecoverWeights(self):
1515    rng = np.random.RandomState(67)
1516    n = 1000
1517    n_weights = 10
1518    bias = 2
1519    x = rng.uniform(-1, 1, (n, n_weights))
1520    weights = 10 * rng.randn(n_weights)
1521    y = np.dot(x, weights)
1522    y += rng.randn(len(x)) * 0.05 + rng.normal(bias, 0.01)
1523    feature_columns = estimator.infer_real_valued_columns_from_input(x)
1524    regressor = linear.LinearRegressor(
1525        feature_columns=feature_columns,
1526        optimizer=ftrl.FtrlOptimizer(learning_rate=0.8))
1527    regressor.fit(x, y, batch_size=64, steps=2000)
1528    self.assertIn('linear//weight', regressor.get_variable_names())
1529    regressor_weights = regressor.get_variable_value('linear//weight')
1530    # Have to flatten weights since they come in (x, 1) shape.
1531    self.assertAllClose(weights, regressor_weights.flatten(), rtol=1)
1532    # TODO(ispir): Disable centered_bias.
1533    # assert abs(bias - regressor.bias_) < 0.1
1534
1535  def testSdcaOptimizerRealValuedLinearFeatures(self):
1536    """Tests LinearRegressor with SDCAOptimizer and real valued features."""
1537    x = [[1.2, 2.0, -1.5], [-2.0, 3.0, -0.5], [1.0, -0.5, 4.0]]
1538    weights = [[3.0], [-1.2], [0.5]]
1539    y = np.dot(x, weights)
1540
1541    def input_fn():
1542      return {
1543          'example_id': constant_op.constant(['1', '2', '3']),
1544          'x': constant_op.constant(x),
1545          'weights': constant_op.constant([[10.0], [10.0], [10.0]])
1546      }, constant_op.constant(y)
1547
1548    x_column = feature_column_lib.real_valued_column('x', dimension=3)
1549    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1550        example_id_column='example_id')
1551    regressor = linear.LinearRegressor(
1552        feature_columns=[x_column],
1553        weight_column_name='weights',
1554        optimizer=sdca_optimizer)
1555    regressor.fit(input_fn=input_fn, steps=20)
1556    loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
1557    self.assertLess(loss, 0.01)
1558    self.assertIn('linear/x/weight', regressor.get_variable_names())
1559    regressor_weights = regressor.get_variable_value('linear/x/weight')
1560    self.assertAllClose(
1561        [w[0] for w in weights], regressor_weights.flatten(), rtol=0.1)
1562
1563  def testSdcaOptimizerMixedFeaturesArbitraryWeights(self):
1564    """Tests LinearRegressor with SDCAOptimizer and a mix of features."""
1565
1566    def input_fn():
1567      return {
1568          'example_id':
1569              constant_op.constant(['1', '2', '3']),
1570          'price':
1571              constant_op.constant([0.6, 0.8, 0.3]),
1572          'sq_footage':
1573              constant_op.constant([[900.0], [700.0], [600.0]]),
1574          'country':
1575              sparse_tensor.SparseTensor(
1576                  values=['IT', 'US', 'GB'],
1577                  indices=[[0, 0], [1, 3], [2, 1]],
1578                  dense_shape=[3, 5]),
1579          'weights':
1580              constant_op.constant([[3.0], [5.0], [7.0]])
1581      }, constant_op.constant([[1.55], [-1.25], [-3.0]])
1582
1583    price = feature_column_lib.real_valued_column('price')
1584    sq_footage_bucket = feature_column_lib.bucketized_column(
1585        feature_column_lib.real_valued_column('sq_footage'),
1586        boundaries=[650.0, 800.0])
1587    country = feature_column_lib.sparse_column_with_hash_bucket(
1588        'country', hash_bucket_size=5)
1589    sq_footage_country = feature_column_lib.crossed_column(
1590        [sq_footage_bucket, country], hash_bucket_size=10)
1591    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1592        example_id_column='example_id', symmetric_l2_regularization=1.0)
1593    regressor = linear.LinearRegressor(
1594        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
1595        weight_column_name='weights',
1596        optimizer=sdca_optimizer)
1597    regressor.fit(input_fn=input_fn, steps=20)
1598    loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
1599    self.assertLess(loss, 0.05)
1600
1601  def testSdcaOptimizerPartitionedVariables(self):
1602    """Tests LinearRegressor with SDCAOptimizer with partitioned variables."""
1603
1604    def input_fn():
1605      return {
1606          'example_id':
1607              constant_op.constant(['1', '2', '3']),
1608          'price':
1609              constant_op.constant([0.6, 0.8, 0.3]),
1610          'sq_footage':
1611              constant_op.constant([[900.0], [700.0], [600.0]]),
1612          'country':
1613              sparse_tensor.SparseTensor(
1614                  values=['IT', 'US', 'GB'],
1615                  indices=[[0, 0], [1, 3], [2, 1]],
1616                  dense_shape=[3, 5]),
1617          'weights':
1618              constant_op.constant([[3.0], [5.0], [7.0]])
1619      }, constant_op.constant([[1.55], [-1.25], [-3.0]])
1620
1621    price = feature_column_lib.real_valued_column('price')
1622    sq_footage_bucket = feature_column_lib.bucketized_column(
1623        feature_column_lib.real_valued_column('sq_footage'),
1624        boundaries=[650.0, 800.0])
1625    country = feature_column_lib.sparse_column_with_hash_bucket(
1626        'country', hash_bucket_size=5)
1627    sq_footage_country = feature_column_lib.crossed_column(
1628        [sq_footage_bucket, country], hash_bucket_size=10)
1629    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1630        example_id_column='example_id', symmetric_l2_regularization=1.0,
1631        partitioner=partitioned_variables.fixed_size_partitioner(
1632            num_shards=2, axis=0))
1633    tf_config = {
1634        'cluster': {
1635            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
1636        }
1637    }
1638    with test.mock.patch.dict('os.environ',
1639                              {'TF_CONFIG': json.dumps(tf_config)}):
1640      config = run_config.RunConfig()
1641      # Because we did not start a distributed cluster, we need to pass an
1642      # empty ClusterSpec, otherwise the device_setter will look for
1643      # distributed jobs, such as "/job:ps" which are not present.
1644      config._cluster_spec = server_lib.ClusterSpec({})
1645
1646    regressor = linear.LinearRegressor(
1647        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
1648        weight_column_name='weights',
1649        optimizer=sdca_optimizer,
1650        config=config)
1651    regressor.fit(input_fn=input_fn, steps=20)
1652    loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
1653    self.assertLess(loss, 0.05)
1654
1655  def testSdcaOptimizerSparseFeaturesWithL1Reg(self):
1656    """Tests LinearClassifier with SDCAOptimizer and sparse features."""
1657
1658    def input_fn():
1659      return {
1660          'example_id':
1661              constant_op.constant(['1', '2', '3']),
1662          'price':
1663              constant_op.constant([[0.4], [0.6], [0.3]]),
1664          'country':
1665              sparse_tensor.SparseTensor(
1666                  values=['IT', 'US', 'GB'],
1667                  indices=[[0, 0], [1, 3], [2, 1]],
1668                  dense_shape=[3, 5]),
1669          'weights':
1670              constant_op.constant([[10.0], [10.0], [10.0]])
1671      }, constant_op.constant([[1.4], [-0.8], [2.6]])
1672
1673    price = feature_column_lib.real_valued_column('price')
1674    country = feature_column_lib.sparse_column_with_hash_bucket(
1675        'country', hash_bucket_size=5)
1676    # Regressor with no L1 regularization.
1677    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1678        example_id_column='example_id')
1679    regressor = linear.LinearRegressor(
1680        feature_columns=[price, country],
1681        weight_column_name='weights',
1682        optimizer=sdca_optimizer)
1683    regressor.fit(input_fn=input_fn, steps=20)
1684    no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
1685    variable_names = regressor.get_variable_names()
1686    self.assertIn('linear/price/weight', variable_names)
1687    self.assertIn('linear/country/weights', variable_names)
1688    no_l1_reg_weights = {
1689        'linear/price/weight': regressor.get_variable_value(
1690            'linear/price/weight'),
1691        'linear/country/weights': regressor.get_variable_value(
1692            'linear/country/weights'),
1693    }
1694
1695    # Regressor with L1 regularization.
1696    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1697        example_id_column='example_id', symmetric_l1_regularization=1.0)
1698    regressor = linear.LinearRegressor(
1699        feature_columns=[price, country],
1700        weight_column_name='weights',
1701        optimizer=sdca_optimizer)
1702    regressor.fit(input_fn=input_fn, steps=20)
1703    l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
1704    l1_reg_weights = {
1705        'linear/price/weight': regressor.get_variable_value(
1706            'linear/price/weight'),
1707        'linear/country/weights': regressor.get_variable_value(
1708            'linear/country/weights'),
1709    }
1710
1711    # Unregularized loss is lower when there is no L1 regularization.
1712    self.assertLess(no_l1_reg_loss, l1_reg_loss)
1713    self.assertLess(no_l1_reg_loss, 0.05)
1714
1715    # But weights returned by the regressor with L1 regularization have smaller
1716    # L1 norm.
1717    l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0
1718    for var_name in sorted(l1_reg_weights):
1719      l1_reg_weights_norm += sum(
1720          np.absolute(l1_reg_weights[var_name].flatten()))
1721      no_l1_reg_weights_norm += sum(
1722          np.absolute(no_l1_reg_weights[var_name].flatten()))
1723      print('Var name: %s, value: %s' %
1724            (var_name, no_l1_reg_weights[var_name].flatten()))
1725    self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)
1726
1727  def testSdcaOptimizerBiasOnly(self):
1728    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
1729
1730    def input_fn():
1731      """Testing the bias weight when it's the only feature present.
1732
1733      All of the instances in this input only have the bias feature, and a
1734      1/4 of the labels are positive. This means that the expected weight for
1735      the bias should be close to the average prediction, i.e 0.25.
1736      Returns:
1737        Training data for the test.
1738      """
1739      num_examples = 40
1740      return {
1741          'example_id':
1742              constant_op.constant([str(x + 1) for x in range(num_examples)]),
1743          # place_holder is an empty column which is always 0 (absent), because
1744          # LinearClassifier requires at least one column.
1745          'place_holder':
1746              constant_op.constant([[0.0]] * num_examples),
1747      }, constant_op.constant(
1748          [[1 if i % 4 == 0 else 0] for i in range(num_examples)])
1749
1750    place_holder = feature_column_lib.real_valued_column('place_holder')
1751    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1752        example_id_column='example_id')
1753    regressor = linear.LinearRegressor(
1754        feature_columns=[place_holder], optimizer=sdca_optimizer)
1755    regressor.fit(input_fn=input_fn, steps=100)
1756
1757    self.assertNear(
1758        regressor.get_variable_value('linear/bias_weight')[0], 0.25, err=0.1)
1759
1760  def testSdcaOptimizerBiasAndOtherColumns(self):
1761    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
1762
1763    def input_fn():
1764      """Testing the bias weight when there are other features present.
1765
1766      1/2 of the instances in this input have feature 'a', the rest have
1767      feature 'b', and we expect the bias to be added to each instance as well.
1768      0.4 of all instances that have feature 'a' are positive, and 0.2 of all
1769      instances that have feature 'b' are positive. The labels in the dataset
1770      are ordered to appear shuffled since SDCA expects shuffled data, and
1771      converges faster with this pseudo-random ordering.
1772      If the bias was centered we would expect the weights to be:
1773      bias: 0.3
1774      a: 0.1
1775      b: -0.1
1776      Until b/29339026 is resolved, the bias gets regularized with the same
1777      global value for the other columns, and so the expected weights get
1778      shifted and are:
1779      bias: 0.2
1780      a: 0.2
1781      b: 0.0
1782      Returns:
1783        The test dataset.
1784      """
1785      num_examples = 200
1786      half = int(num_examples / 2)
1787      return {
1788          'example_id':
1789              constant_op.constant([str(x + 1) for x in range(num_examples)]),
1790          'a':
1791              constant_op.constant([[1]] * int(half) + [[0]] * int(half)),
1792          'b':
1793              constant_op.constant([[0]] * int(half) + [[1]] * int(half)),
1794      }, constant_op.constant(
1795          [[x]
1796           for x in [1, 0, 0, 1, 1, 0, 0, 0, 1, 0] * int(half / 10) +
1797           [0, 1, 0, 0, 0, 0, 0, 0, 1, 0] * int(half / 10)])
1798
1799    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1800        example_id_column='example_id')
1801    regressor = linear.LinearRegressor(
1802        feature_columns=[
1803            feature_column_lib.real_valued_column('a'),
1804            feature_column_lib.real_valued_column('b')
1805        ],
1806        optimizer=sdca_optimizer)
1807
1808    regressor.fit(input_fn=input_fn, steps=200)
1809
1810    variable_names = regressor.get_variable_names()
1811    self.assertIn('linear/bias_weight', variable_names)
1812    self.assertIn('linear/a/weight', variable_names)
1813    self.assertIn('linear/b/weight', variable_names)
1814    # TODO(b/29339026): Change the expected results to expect a centered bias.
1815    self.assertNear(
1816        regressor.get_variable_value('linear/bias_weight')[0], 0.2, err=0.05)
1817    self.assertNear(
1818        regressor.get_variable_value('linear/a/weight')[0], 0.2, err=0.05)
1819    self.assertNear(
1820        regressor.get_variable_value('linear/b/weight')[0], 0.0, err=0.05)
1821
1822  def testSdcaOptimizerBiasAndOtherColumnsFabricatedCentered(self):
1823    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""
1824
1825    def input_fn():
1826      """Testing the bias weight when there are other features present.
1827
1828      1/2 of the instances in this input have feature 'a', the rest have
1829      feature 'b', and we expect the bias to be added to each instance as well.
1830      0.1 of all instances that have feature 'a' have a label of 1, and 0.1 of
1831      all instances that have feature 'b' have a label of -1.
1832      We can expect the weights to be:
1833      bias: 0.0
1834      a: 0.1
1835      b: -0.1
1836      Returns:
1837        The test dataset.
1838      """
1839      num_examples = 200
1840      half = int(num_examples / 2)
1841      return {
1842          'example_id':
1843              constant_op.constant([str(x + 1) for x in range(num_examples)]),
1844          'a':
1845              constant_op.constant([[1]] * int(half) + [[0]] * int(half)),
1846          'b':
1847              constant_op.constant([[0]] * int(half) + [[1]] * int(half)),
1848      }, constant_op.constant([[1 if x % 10 == 0 else 0] for x in range(half)] +
1849                              [[-1 if x % 10 == 0 else 0] for x in range(half)])
1850
1851    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
1852        example_id_column='example_id')
1853    regressor = linear.LinearRegressor(
1854        feature_columns=[
1855            feature_column_lib.real_valued_column('a'),
1856            feature_column_lib.real_valued_column('b')
1857        ],
1858        optimizer=sdca_optimizer)
1859
1860    regressor.fit(input_fn=input_fn, steps=100)
1861
1862    variable_names = regressor.get_variable_names()
1863    self.assertIn('linear/bias_weight', variable_names)
1864    self.assertIn('linear/a/weight', variable_names)
1865    self.assertIn('linear/b/weight', variable_names)
1866    self.assertNear(
1867        regressor.get_variable_value('linear/bias_weight')[0], 0.0, err=0.05)
1868    self.assertNear(
1869        regressor.get_variable_value('linear/a/weight')[0], 0.1, err=0.05)
1870    self.assertNear(
1871        regressor.get_variable_value('linear/b/weight')[0], -0.1, err=0.05)
1872
1873
class LinearEstimatorTest(test.TestCase):
  """Tests for `linear.LinearEstimator` constructed with an explicit head."""

  def testExperimentIntegration(self):
    """Runs `Experiment.test()` with a regression-head LinearEstimator."""
    feature = feature_column_lib.real_valued_column('feature', dimension=4)
    regression_head = head_lib.regression_head()
    linear_estimator = linear.LinearEstimator(
        feature_columns=[feature], head=regression_head)
    exp = experiment.Experiment(
        estimator=linear_estimator,
        train_input_fn=test_data.iris_input_logistic_fn,
        eval_input_fn=test_data.iris_input_logistic_fn)
    exp.test()

  def testEstimatorContract(self):
    """Checks LinearEstimator against the shared estimator contract."""
    estimator_class = linear.LinearEstimator
    estimator_test_utils.assert_estimator_contract(self, estimator_class)

  def testLinearRegression(self):
    """Tests that loss goes down with training."""

    def input_fn():
      age = constant_op.constant([1])
      language = sparse_tensor.SparseTensor(
          values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      label = constant_op.constant([[10.]])
      return {'age': age, 'language': language}, label

    language_column = feature_column_lib.sparse_column_with_hash_bucket(
        'language', 100)
    age_column = feature_column_lib.real_valued_column('age')
    linear_estimator = linear.LinearEstimator(
        feature_columns=[age_column, language_column],
        head=head_lib.regression_head())

    def _loss_after(steps):
      # Trains for `steps` additional steps, then reports the eval loss.
      linear_estimator.fit(input_fn=input_fn, steps=steps)
      return linear_estimator.evaluate(input_fn=input_fn, steps=1)['loss']

    early_loss = _loss_after(100)
    late_loss = _loss_after(400)

    self.assertLess(late_loss, early_loss)
    self.assertLess(late_loss, 0.5)

  def testPoissonRegression(self):
    """Tests that loss goes down with training."""

    def input_fn():
      age = constant_op.constant([1])
      language = sparse_tensor.SparseTensor(
          values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      label = constant_op.constant([[10.]])
      return {'age': age, 'language': language}, label

    language_column = feature_column_lib.sparse_column_with_hash_bucket(
        'language', 100)
    age_column = feature_column_lib.real_valued_column('age')
    linear_estimator = linear.LinearEstimator(
        feature_columns=[age_column, language_column],
        head=head_lib.poisson_regression_head())

    def _loss_after(steps):
      # Trains for `steps` additional steps, then reports the eval loss.
      linear_estimator.fit(input_fn=input_fn, steps=steps)
      return linear_estimator.evaluate(input_fn=input_fn, steps=1)['loss']

    early_loss = _loss_after(10)
    late_loss = _loss_after(100)

    self.assertLess(late_loss, early_loss)
    # Here loss of 2.1 implies a prediction of ~9.9998
    self.assertLess(late_loss, 2.1)

  def testSDCANotSupported(self):
    """Tests that we detect error for SDCA."""
    columns = [
        feature_column_lib.real_valued_column('maintenance_cost'),
        feature_column_lib.real_valued_column('sq_footage'),
    ]
    sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
        example_id_column='example_id')
    # Constructing the estimator with these arguments must be rejected.
    with self.assertRaises(ValueError):
      linear.LinearEstimator(
          head=head_lib.regression_head(label_dimension=1),
          feature_columns=columns,
          optimizer=sdca_optimizer,
          _joint_weights=True)
1958
1959
def boston_input_fn():
  """Returns `base.load_boston()` data as float32 (features, labels) tensors.

  The data is reshaped to [-1, 13] and the target to [-1, 1].
  """
  dataset = base.load_boston()

  def _as_float32(values, shape):
    # Wraps `values` in a constant, reshapes it, then casts to float32.
    reshaped = array_ops.reshape(constant_op.constant(values), shape)
    return math_ops.cast(reshaped, dtypes.float32)

  features = _as_float32(dataset.data, [-1, 13])
  labels = _as_float32(dataset.target, [-1, 1])
  return features, labels
1969
1970
class FeatureColumnTest(test.TestCase):
  """Smoke test for feature columns inferred from an input function."""

  def testTrain(self):
    """Fits and evaluates for one step using inferred real valued columns."""
    inferred_columns = estimator.infer_real_valued_columns_from_input_fn(
        boston_input_fn)
    regressor = linear.LinearRegressor(feature_columns=inferred_columns)
    regressor.fit(input_fn=boston_input_fn, steps=1)
    # Only checks that evaluation runs; the metrics are not inspected.
    regressor.evaluate(input_fn=boston_input_fn, steps=1)
1979
1980
# When this file is executed as a script, hand control to `test.main()`.
if __name__ == '__main__':
  test.main()
1983