001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading;
022
023import java.io.Serializable;
024import java.util.ArrayList;
025import java.util.Collection;
026import java.util.Collections;
027import java.util.Comparator;
028import java.util.HashMap;
029import java.util.List;
030import java.util.Map;
031import java.util.regex.Pattern;
032
033import cascading.cascade.Cascades;
034import cascading.flow.Flow;
035import cascading.operation.Debug;
036import cascading.operation.Filter;
037import cascading.operation.Function;
038import cascading.operation.Identity;
039import cascading.operation.Insert;
040import cascading.operation.NoOp;
041import cascading.operation.aggregator.Count;
042import cascading.operation.aggregator.First;
043import cascading.operation.expression.ExpressionFunction;
044import cascading.operation.filter.And;
045import cascading.operation.function.UnGroup;
046import cascading.operation.regex.RegexFilter;
047import cascading.operation.regex.RegexParser;
048import cascading.operation.regex.RegexSplitter;
049import cascading.pipe.Each;
050import cascading.pipe.Every;
051import cascading.pipe.GroupBy;
052import cascading.pipe.Merge;
053import cascading.pipe.Pipe;
054import cascading.tap.MultiSourceTap;
055import cascading.tap.SinkMode;
056import cascading.tap.Tap;
057import cascading.tuple.Fields;
058import cascading.tuple.Hasher;
059import cascading.tuple.Tuple;
060import org.junit.Test;
061
062import static cascading.ComparePlatformsTest.NONDETERMINISTIC;
063import static data.InputData.*;
064
065public class FieldedPipesPlatformTest extends PlatformTestCase
066  {
  /**
   * Creates the test case, delegating its configuration to the {@code PlatformTestCase} constructor.
   * <p>
   * NOTE(review): the first argument presumably enables cluster mode (see the inline comment) and the
   * two integers presumably size the test cluster/tasks -- confirm against {@code PlatformTestCase}.
   */
  public FieldedPipesPlatformTest()
    {
    super( true, 5, 3 ); // leave cluster testing enabled
    }
071
072  @Test
073  public void testSimpleGroup() throws Exception
074    {
075    getPlatform().copyFromLocal( inputFileApache );
076
077    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
078
079    Pipe pipe = new Pipe( "test" );
080
081    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
082
083    pipe = new GroupBy( pipe, new Fields( "ip" ) );
084
085    pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) );
086
087    Tap sink = getPlatform().getTextFile( getOutputPath( "simple" ), SinkMode.REPLACE );
088
089    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
090
091    flow.complete();
092
093    validateLength( flow.openSource(), 10 ); // validate source, this once, as a sanity check
094    validateLength( flow, 8, null );
095    }
096
097  @Test
098  public void testSimpleChain() throws Exception
099    {
100    getPlatform().copyFromLocal( inputFileApache );
101
102    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
103
104    Pipe pipe = new Pipe( "test" );
105
106    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
107
108    pipe = new GroupBy( pipe, new Fields( "ip" ) );
109
110    pipe = new Every( pipe, new Count( new Fields( "count1" ) ) );
111    pipe = new Every( pipe, new Count( new Fields( "count2" ) ) );
112    pipe = new Every( pipe, new Count( new Fields( "count3" ) ) );
113    pipe = new Every( pipe, new Count( new Fields( "count4" ) ) );
114
115    Tap sink = getPlatform().getTabDelimitedFile( Fields.ALL, getOutputPath( "simplechain" ), SinkMode.REPLACE );
116
117    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
118
119    flow.complete();
120
121    validateLength( flow, 8, 5 );
122    }
123
124  @Test
125  public void testChainEndingWithEach() throws Exception
126    {
127    getPlatform().copyFromLocal( inputFileApache );
128
129    Pipe pipe = new Pipe( "test" );
130
131    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
132
133    pipe = new GroupBy( pipe, new Fields( "ip" ) );
134
135    pipe = new Every( pipe, new Count( new Fields( "count1" ) ) );
136    pipe = new Every( pipe, new Count( new Fields( "count2" ) ) );
137
138    pipe = new Each( pipe, new Fields( "count1", "count2" ), new ExpressionFunction( new Fields( "sum" ), "count1 + count2", int.class ), Fields.ALL );
139
140    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
141    Tap sink = getPlatform().getTextFile( getOutputPath( "chaineach" ), SinkMode.REPLACE );
142
143    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
144
145    flow.complete();
146
147    validateLength( flow, 8, null );
148    }
149
150  // also tests the RegexSplitter
151
152  @Test
153  public void testNoGroup() throws Exception
154    {
155    getPlatform().copyFromLocal( inputFileApache );
156
157    Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileApache );
158
159    Pipe pipe = new Pipe( "test" );
160
161    pipe = new Each( pipe, new RegexSplitter( "\\s+" ), new Fields( 1 ) );
162
163    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "nogroup" ), SinkMode.REPLACE );
164
165    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
166
167    flow.complete();
168
169    validateLength( flow, 10, null );
170
171    List<Tuple> results = getSinkAsList( flow );
172
173    assertTrue( results.contains( new Tuple( "75.185.76.245" ) ) );
174    }
175
176  @Test
177  public void testCopy() throws Exception
178    {
179    getPlatform().copyFromLocal( inputFileApache );
180
181    Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileApache );
182
183    Pipe pipe = new Pipe( "test" );
184
185    Tap sink = getPlatform().getTextFile( getOutputPath( "copy" ), SinkMode.REPLACE );
186
187    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
188
189    flow.complete();
190
191    validateLength( flow, 10, null );
192    }
193
194  @Test
195  public void testSimpleMerge() throws Exception
196    {
197    getPlatform().copyFromLocal( inputFileLower );
198    getPlatform().copyFromLocal( inputFileUpper );
199
200    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
201    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
202
203    Map sources = new HashMap();
204
205    sources.put( "lower", sourceLower );
206    sources.put( "upper", sourceUpper );
207
208    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
209
210    // using null pos so all fields are written
211    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemerge" ), SinkMode.REPLACE );
212
213    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
214    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
215
216    Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper ), new Fields( "num" ), null, false );
217
218    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
219
220    flow.complete();
221
222    validateLength( flow, 10 );
223
224    Collection results = getSinkAsList( flow );
225
226    assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
227    assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
228    assertTrue( "missing value", results.contains( new Tuple( "2\tb" ) ) );
229    assertTrue( "missing value", results.contains( new Tuple( "2\tB" ) ) );
230    assertTrue( "missing value", results.contains( new Tuple( "3\tc" ) ) );
231    assertTrue( "missing value", results.contains( new Tuple( "3\tC" ) ) );
232    }
233
234  /**
235   * Specifically tests GroupBy will return the correct grouping fields to the following Every
236   * <p>
237   * additionally tests secondary sorting during merging
238   *
239   * @throws Exception
240   */
241  @Test
242  public void testSimpleMergeThree() throws Exception
243    {
244    getPlatform().copyFromLocal( inputFileLower );
245    getPlatform().copyFromLocal( inputFileUpper );
246    getPlatform().copyFromLocal( inputFileLowerOffset );
247
248    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
249    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
250    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );
251
252    Map sources = new HashMap();
253
254    sources.put( "lower", sourceLower );
255    sources.put( "upper", sourceUpper );
256    sources.put( "offset", sourceLowerOffset );
257
258    Tap sink = getPlatform().getDelimitedFile( Fields.ALL, "\t", getOutputPath( "simplemergethree" ), SinkMode.REPLACE );
259
260    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
261
262    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
263    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
264    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );
265
266    Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper, pipeOffset ), new Fields( "num" ), new Fields( "char" ) );
267
268    splice = new Every( splice, new Fields( "char" ), new First( new Fields( "first" ) ) );
269
270    splice = new Each( splice, new Fields( "num", "first" ), new Identity() );
271
272    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
273
274    flow.complete();
275
276    validateLength( flow, 6 );
277
278    List<Tuple> tuples = getSinkAsList( flow );
279
280    assertTrue( tuples.contains( new Tuple( "1", "A" ) ) );
281    assertTrue( tuples.contains( new Tuple( "2", "B" ) ) );
282    assertTrue( tuples.contains( new Tuple( "3", "C" ) ) );
283    assertTrue( tuples.contains( new Tuple( "4", "D" ) ) );
284    assertTrue( tuples.contains( new Tuple( "5", "E" ) ) );
285    assertTrue( tuples.contains( new Tuple( "6", "c" ) ) );
286    }
287
288  @Test
289  public void testSameSourceMerge() throws Exception
290    {
291    getPlatform().copyFromLocal( inputFileLower );
292
293    Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );
294
295    Map sources = new HashMap();
296
297    sources.put( "lower", sourceLower );
298    sources.put( "upper", sourceLower );
299
300    // using null pos so all fields are written
301    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath(), SinkMode.REPLACE );
302
303    Pipe pipeLower = new Pipe( "lower" );
304    Pipe pipeUpper = new Pipe( "upper" );
305
306    Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper ), new Fields( "num" ), null, false );
307
308    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
309
310    flow.complete();
311
312    validateLength( flow, 10 );
313
314    Collection results = getSinkAsList( flow );
315
316    assertEquals( "missing value", 2, Collections.frequency( results, new Tuple( "1\ta" ) ) );
317    assertEquals( "missing value", 2, Collections.frequency( results, new Tuple( "2\tb" ) ) );
318    assertEquals( "missing value", 2, Collections.frequency( results, new Tuple( "3\tc" ) ) );
319    }
320
321  /**
322   * same test as MergePipesTest, but to test that chained groupby don't exhibit similar failures
323   *
324   * @throws Exception
325   */
326  @Test
327  public void testSameSourceMergeThreeChainGroup() throws Exception
328    {
329    getPlatform().copyFromLocal( inputFileLower );
330
331    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
332
333    Map sources = new HashMap();
334
335    sources.put( "split", sourceLower );
336
337    Tap sink = getPlatform().getTextFile( getOutputPath( "samemergethreechaingroup" ), SinkMode.REPLACE );
338
339    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
340
341    Pipe pipe = new Pipe( "split" );
342
343    Pipe pipeLower = new Each( new Pipe( "lower", pipe ), new Fields( "line" ), splitter );
344    Pipe pipeUpper = new Each( new Pipe( "upper", pipe ), new Fields( "line" ), splitter );
345    Pipe pipeOffset = new Each( new Pipe( "offset", pipe ), new Fields( "line" ), splitter );
346
347    //put group before merge to test path counts
348    Pipe splice = new GroupBy( Pipe.pipes( pipeLower, pipeUpper ), new Fields( "num" ) );
349
350    // this group has its incoming paths counted, gated by the previous group
351    splice = new GroupBy( Pipe.pipes( splice, pipeOffset ), new Fields( "num" ) );
352
353    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
354
355    if( getPlatform().isMapReduce() )
356      assertEquals( "wrong num jobs", 2, flow.getFlowSteps().size() );
357
358    flow.complete();
359
360    validateLength( flow, 15 );
361    }
362
363  @Test
364  public void testUnGroup() throws Exception
365    {
366    getPlatform().copyFromLocal( inputFileJoined );
367
368    Tap source = getPlatform().getTextFile( inputFileJoined );
369    Tap sink = getPlatform().getTextFile( getOutputPath( "ungrouped" ), SinkMode.REPLACE );
370
371    Pipe pipe = new Pipe( "test" );
372
373    pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ) ) );
374
375    pipe = new Each( pipe, new UnGroup( new Fields( "num", "char" ), new Fields( "num" ), Fields.fields( new Fields( "lower" ), new Fields( "upper" ) ) ) );
376
377    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
378
379    flow.complete();
380
381    validateLength( flow, 10 );
382    }
383
384  @Test
385  public void testUnGroupAnon() throws Exception
386    {
387    getPlatform().copyFromLocal( inputFileJoined );
388
389    Tap source = getPlatform().getTextFile( inputFileJoined );
390    Tap sink = getPlatform().getTextFile( getOutputPath( "ungroupedanon" ), SinkMode.REPLACE );
391
392    Pipe pipe = new Pipe( "test" );
393
394    pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ) ) );
395
396    pipe = new Each( pipe, new UnGroup( new Fields( "num" ), Fields.fields( new Fields( "lower" ), new Fields( "upper" ) ) ) );
397
398    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
399
400    flow.complete();
401
402    validateLength( flow, 10 );
403    }
404
405  @Test
406  public void testUnGroupBySize() throws Exception
407    {
408    getPlatform().copyFromLocal( inputFileJoinedExtra );
409
410    Tap source = getPlatform().getTextFile( inputFileJoinedExtra );
411    Tap sink = getPlatform().getTextFile( getOutputPath( "ungrouped_size" ), SinkMode.REPLACE );
412
413    Pipe pipe = new Pipe( "test" );
414
415    pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num1", "num2", "lower", "upper" ) ) );
416
417    pipe = new Each( pipe, new UnGroup( new Fields( "num1", "num2", "char" ), new Fields( "num1", "num2" ), 1 ) );
418
419    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
420
421    flow.complete();
422
423    List<Tuple> tuples = asList( flow, sink );
424    assertEquals( 10, tuples.size() );
425
426    List<Object> values = new ArrayList<Object>();
427    for( Tuple tuple : tuples )
428      values.add( tuple.getObject( 1 ) );
429
430    assertTrue( values.contains( "1\t1\ta" ) );
431    assertTrue( values.contains( "1\t1\tA" ) );
432    assertTrue( values.contains( "2\t2\tb" ) );
433    assertTrue( values.contains( "2\t2\tB" ) );
434    assertTrue( values.contains( "3\t3\tc" ) );
435    assertTrue( values.contains( "3\t3\tC" ) );
436    assertTrue( values.contains( "4\t4\td" ) );
437    assertTrue( values.contains( "4\t4\tD" ) );
438    assertTrue( values.contains( "5\t5\te" ) );
439    assertTrue( values.contains( "5\t5\tE" ) );
440    }
441
442  @Test
443  public void testFilter() throws Exception
444    {
445    getPlatform().copyFromLocal( inputFileApache );
446
447    Tap source = getPlatform().getTextFile( inputFileApache );
448    Tap sink = getPlatform().getTextFile( getOutputPath( "filter" ), SinkMode.REPLACE );
449
450    Pipe pipe = new Pipe( "test" );
451
452    Filter filter = new RegexFilter( "^68.*" );
453
454    pipe = new Each( pipe, new Fields( "line" ), filter );
455
456    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
457
458    flow.complete();
459
460    validateLength( flow, 3 );
461    }
462
463  @Test
464  public void testLogicFilter() throws Exception
465    {
466    getPlatform().copyFromLocal( inputFileApache );
467
468    Tap source = getPlatform().getTextFile( inputFileApache );
469    Tap sink = getPlatform().getTextFile( getOutputPath( "logicfilter" ), SinkMode.REPLACE );
470
471    Pipe pipe = new Pipe( "test" );
472
473    Filter filter = new And( new RegexFilter( "^68.*$" ), new RegexFilter( "^1000.*$" ) );
474
475    pipe = new Each( pipe, new Fields( "line" ), filter );
476
477    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
478
479    flow.complete();
480
481    validateLength( flow, 3 );
482    }
483
484  @Test
485  public void testFilterComplex() throws Exception
486    {
487    getPlatform().copyFromLocal( inputFileApache );
488
489    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
490    Tap sink = getPlatform().getTextFile( getOutputPath( "filtercomplex" ), SinkMode.REPLACE );
491
492    Pipe pipe = new Pipe( "test" );
493
494    pipe = new Each( pipe, new Fields( "line" ), TestConstants.APACHE_COMMON_PARSER );
495
496    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) );
497    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) );
498
499    pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL );
500
501    pipe = new GroupBy( pipe, new Fields( "value" ) );
502
503    pipe = new Every( pipe, new Count(), new Fields( "value", "count" ) );
504
505    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
506
507    flow.complete();
508
509    validateLength( flow, 1, null );
510    }
511
512  /**
513   * Intentionally filters all values out to test next mr job behaves
514   *
515   * @throws Exception
516   */
517  @Test
518  public void testFilterAll() throws Exception
519    {
520    getPlatform().copyFromLocal( inputFileApache );
521
522    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
523    Tap sink = getPlatform().getTextFile( getOutputPath( "filterall" ), SinkMode.REPLACE );
524
525    Pipe pipe = new Pipe( "test" );
526
527    String regex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";
528    Fields fieldDeclaration = new Fields( "ip", "time", "method", "event", "status", "size" );
529    int[] groups = {1, 2, 3, 4, 5, 6};
530    RegexParser function = new RegexParser( fieldDeclaration, regex, groups );
531    pipe = new Each( pipe, new Fields( "line" ), function );
532
533    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all
534
535    pipe = new GroupBy( pipe, new Fields( "method" ) );
536
537    pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL );
538
539    pipe = new GroupBy( pipe, new Fields( "value" ) );
540
541    pipe = new Every( pipe, new Count(), new Fields( "value", "count" ) );
542
543    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
544
545    flow.complete();
546
547    validateLength( flow, 0, null );
548    }
549
550//  public void testLimitFilter() throws Exception
551//    {
552//    copyFromLocal( inputFileApache );
553//
554//    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
555//    Tap sink = new Lfs( new TextLine(), outputPath + "/limitfilter", true );
556//
557//    Pipe pipe = new Pipe( "test" );
558//
559//    Filter filter = new Limit( 7 );
560//
561//    pipe = new Each( pipe, new Fields( "line" ), filter );
562//
563//    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );
564//
565////    flow.writeDOT( "flow.dot" );
566//
567//    flow.complete();
568//
569//    validateLength( flow, 7, null );
570//    }
571
572  //
573
574  /*
575   *
576   * TODO: create (optional) Tez rule to consolidate into a single DAG. currently renders to two DAGs, one for each side
577   *
578   */
579  @Test
580  public void testSplit() throws Exception
581    {
582    getPlatform().copyFromLocal( inputFileApache );
583
584    // 46 192
585
586    Tap source = getPlatform().getTextFile( inputFileApache );
587    Tap sink1 = getPlatform().getTextFile( getOutputPath( "split1" ), SinkMode.REPLACE );
588    Tap sink2 = getPlatform().getTextFile( getOutputPath( "split2" ), SinkMode.REPLACE );
589
590    Pipe pipe = new Pipe( "split" );
591
592    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );
593
594    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
595    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );
596
597    Map sources = new HashMap();
598    sources.put( "split", source );
599
600    Map sinks = new HashMap();
601    sinks.put( "left", sink1 );
602    sinks.put( "right", sink2 );
603
604    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );
605
606    flow.complete();
607
608    validateLength( flow, 1, "left" );
609    validateLength( flow, 2, "right" );
610    }
611
612  /**
613   * verifies non-safe rules apply in the proper place
614   *
615   * @throws Exception
616   */
617  @Test
618  public void testSplitNonSafe() throws Exception
619    {
620    getPlatform().copyFromLocal( inputFileApache );
621
622    // 46 192
623
624    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
625    Tap sink1 = getPlatform().getTextFile( getOutputPath( "nonsafesplit1" ), SinkMode.REPLACE );
626    Tap sink2 = getPlatform().getTextFile( getOutputPath( "nonsafesplit2" ), SinkMode.REPLACE );
627
628    Pipe pipe = new Pipe( "split" );
629
630    // run job on non-safe operation, forces 3 mr jobs.
631    pipe = new Each( pipe, new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) );
632
633    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );
634
635    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
636    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );
637
638    Map sources = new HashMap();
639    sources.put( "split", source );
640
641    Map sinks = new HashMap();
642    sinks.put( "left", sink1 );
643    sinks.put( "right", sink2 );
644
645    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );
646
647    flow.complete();
648
649    validateLength( flow, 1, "left" );
650    validateLength( flow, 2, "right" );
651    }
652
653  @Test
654  public void testSplitSameSourceMerged() throws Exception
655    {
656    getPlatform().copyFromLocal( inputFileApache );
657
658    // 46 192
659
660    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
661    Tap sink = getPlatform().getTextFile( getOutputPath( "splitsourcemerged" ), SinkMode.REPLACE );
662
663    Pipe pipe = new Pipe( "split" );
664
665    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );
666
667    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
668    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );
669
670    Pipe merged = new GroupBy( "merged", Pipe.pipes( left, right ), new Fields( "line" ) );
671
672    Flow flow = getPlatform().getFlowConnector().connect( source, sink, merged );
673
674    flow.complete();
675
676    validateLength( flow, 3 );
677    }
678
679  /**
680   * verifies not inserting Identity between groups works
681   *
682   * @throws Exception
683   */
684  @Test
685  public void testSplitOut() throws Exception
686    {
687    getPlatform().copyFromLocal( inputFileApache );
688
689    Tap sourceLower = getPlatform().getTextFile( new Fields( "num", "line" ), inputFileApache );
690
691    Map sources = new HashMap();
692
693    sources.put( "lower1", sourceLower );
694
695    // using null pos so all fields are written
696    Tap sink1 = getPlatform().getTextFile( getOutputPath( "splitout1" ), SinkMode.REPLACE );
697    Tap sink2 = getPlatform().getTextFile( getOutputPath( "splitout2" ), SinkMode.REPLACE );
698
699    Map sinks = new HashMap();
700
701    sinks.put( "output1", sink1 );
702    sinks.put( "output2", sink2 );
703
704    Pipe pipeLower1 = new Pipe( "lower1" );
705
706    Pipe left = new GroupBy( "output1", pipeLower1, new Fields( 0 ) );
707    Pipe right = new GroupBy( "output2", left, new Fields( 0 ) );
708
709    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, Pipe.pipes( left, right ) );
710
711//    flow.writeDOT( "spit.dot" );
712
713    flow.complete();
714
715    validateLength( flow, 10, "output1" );
716    validateLength( flow, 10, "output2" );
717
718    assertEquals( 10, asSet( flow, sink1 ).size() );
719    assertEquals( 10, asSet( flow, sink2 ).size() );
720    }
721
722  @Test
723  public void testSplitComplex() throws Exception
724    {
725    getPlatform().copyFromLocal( inputFileApache );
726
727    // 46 192
728
729    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
730    Tap sink1 = getPlatform().getTextFile( getOutputPath( "splitcomp1" ), SinkMode.REPLACE );
731    Tap sink2 = getPlatform().getTextFile( getOutputPath( "splitcomp2" ), SinkMode.REPLACE );
732
733    Pipe pipe = new Pipe( "split" );
734
735    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
736
737    pipe = new GroupBy( pipe, new Fields( "ip" ) );
738
739    pipe = new Every( pipe, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) );
740
741    pipe = new Each( pipe, new Fields( "ip" ), new RegexFilter( "^68.*" ) );
742
743    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "ip" ), new RegexFilter( ".*46.*" ) );
744
745    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "ip" ), new RegexFilter( ".*102.*" ) );
746
747    Map sources = Cascades.tapsMap( "split", source );
748    Map sinks = Cascades.tapsMap( Pipe.pipes( left, right ), Tap.taps( sink1, sink2 ) );
749
750    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );
751
752    flow.complete();
753
754    validateLength( flow, 1, "left" );
755    validateLength( flow, 1, "right" );
756    }
757
758  @Test
759  public void testSplitMultiple() throws Exception
760    {
761    getPlatform().copyFromLocal( inputFileApache );
762
763    // 46 192
764
765    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
766    Tap sinkLeft = getPlatform().getTextFile( getOutputPath( "left" ), SinkMode.REPLACE );
767    Tap sinkRightLeft = getPlatform().getTextFile( getOutputPath( "rightleft" ), SinkMode.REPLACE );
768    Tap sinkRightRight = getPlatform().getTextFile( getOutputPath( "rightright" ), SinkMode.REPLACE );
769
770    Pipe head = new Pipe( "split" );
771
772    head = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
773
774    head = new GroupBy( head, new Fields( "ip" ) );
775
776    head = new Every( head, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) );
777
778    head = new Each( head, new Fields( "ip" ), new RegexFilter( "^68.*" ) );
779
780    Pipe left = new Each( new Pipe( "left", head ), new Fields( "ip" ), new RegexFilter( ".*46.*" ) );
781
782    Pipe right = new Each( new Pipe( "right", head ), new Fields( "ip" ), new RegexFilter( ".*102.*" ) );
783
784    right = new GroupBy( right, new Fields( "ip" ) );
785
786    Pipe rightLeft = new Each( new Pipe( "rightLeft", right ), new Fields( "ip" ), new Identity() );
787
788    Pipe rightRight = new Each( new Pipe( "rightRight", right ), new Fields( "ip" ), new Identity() );
789
790    Map sources = Cascades.tapsMap( "split", source );
791    Map sinks = Cascades.tapsMap( Pipe.pipes( left, rightLeft, rightRight ), Tap.taps( sinkLeft, sinkRightLeft, sinkRightRight ) );
792
793    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, rightLeft, rightRight );
794
795    flow.complete();
796
797    validateLength( flow, 1, "left" );
798    validateLength( flow, 1, "rightLeft" );
799    validateLength( flow, 1, "rightRight" );
800    }
801
802  @Test
803  public void testConcatenation() throws Exception
804    {
805    getPlatform().copyFromLocal( inputFileLower );
806    getPlatform().copyFromLocal( inputFileUpper );
807
808    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
809    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
810
811    Tap source = new MultiSourceTap( sourceLower, sourceUpper );
812
813    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
814
815    // using null pos so all fields are written
816    Tap sink = getPlatform().getTextFile( getOutputPath( "complexconcat" ), SinkMode.REPLACE );
817
818    Pipe pipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter );
819
820    Pipe splice = new GroupBy( pipe, new Fields( "num" ) );
821
822    Flow countFlow = getPlatform().getFlowConnector().connect( source, sink, splice );
823
824    countFlow.complete();
825
826    validateLength( countFlow, 10, null );
827    }
828
829  @Test
830  public void testGeneratorAggregator() throws Exception
831    {
832    getPlatform().copyFromLocal( inputFileApache );
833
834    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
835
836    Pipe pipe = new Pipe( "test" );
837
838    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
839
840    pipe = new GroupBy( pipe, new Fields( "ip" ) );
841
842    pipe = new Every( pipe, new TestAggregator( new Fields( "count1" ), new Fields( "ip" ), new Tuple( "first1" ), new Tuple( "first2" ) ) );
843    pipe = new Every( pipe, new TestAggregator( new Fields( "count2" ), new Fields( "ip" ), new Tuple( "second" ), new Tuple( "second2" ), new Tuple( "second3" ) ) );
844
845    Tap sink = getPlatform().getTextFile( getOutputPath( "generatoraggregator" ), SinkMode.REPLACE );
846
847    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
848
849    flow.complete();
850
851    validateLength( flow, 8 * 2 * 3, null );
852    }
853
854  @Test
855  public void testReplace() throws Exception
856    {
857    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
858    Tap sink = getPlatform().getTextFile( new Fields( "offset", "line" ), new Fields( "offset", "line" ), getOutputPath( "replace" ), SinkMode.REPLACE );
859
860    Pipe pipe = new Pipe( "test" );
861
862    Function parser = new RegexParser( new Fields( 0 ), "^[^ ]*" );
863    pipe = new Each( pipe, new Fields( "line" ), parser, Fields.REPLACE );
864    pipe = new Each( pipe, new Fields( "line" ), new Identity( Fields.ARGS ), Fields.REPLACE );
865    pipe = new Each( pipe, new Fields( "line" ), new Identity( new Fields( "line" ) ), Fields.REPLACE );
866
867    pipe = new Each( pipe, new Debug( true ) );
868
869    Flow flow = getPlatform().getFlowConnector( disableDebug() ).connect( source, sink, pipe );
870
871    flow.complete();
872
873    validateLength( flow, 10, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) );
874    }
875
876  @Test
877  public void testSwap() throws Exception
878    {
879    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
880    Tap sink = getPlatform().getTextFile( new Fields( "offset", "line" ), new Fields( "count", "ipaddress" ), getOutputPath( "swap" ), SinkMode.REPLACE );
881
882    Pipe pipe = new Pipe( "test" );
883
884    Function parser = new RegexParser( new Fields( "ip" ), "^[^ ]*" );
885    pipe = new Each( pipe, new Fields( "line" ), parser, Fields.SWAP );
886    pipe = new GroupBy( pipe, new Fields( "ip" ) );
887    pipe = new Every( pipe, new Fields( "ip" ), new Count( new Fields( "count" ) ) );
888    pipe = new Each( pipe, new Fields( "ip" ), new Identity( new Fields( "ipaddress" ) ), Fields.SWAP );
889
890    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
891
892    flow.complete();
893
894    validateLength( flow, 8, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) );
895    }
896
897  @Test
898  public void testNone() throws Exception
899    {
900    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
901    Tap sink = getPlatform().getTextFile( new Fields( "offset", "line" ), new Fields( "count", "ip" ), getOutputPath( "none" ), SinkMode.REPLACE );
902
903    Pipe pipe = new Pipe( "test" );
904
905    Function parser = new RegexParser( new Fields( "ip" ), "^[^ ]*" );
906    pipe = new Each( pipe, new Fields( "line" ), parser, Fields.ALL );
907    pipe = new Each( pipe, new Fields( "line" ), new NoOp(), Fields.SWAP ); // declares Fields.NONE
908    pipe = new GroupBy( pipe, new Fields( "ip" ) );
909    pipe = new Every( pipe, new Fields( "ip" ), new Count( new Fields( "count" ) ) );
910    pipe = new Each( pipe, Fields.NONE, new Insert( new Fields( "ipaddress" ), "1.2.3.4" ), Fields.ALL );
911
912    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
913
914    flow.complete();
915
916    validateLength( flow, 8, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) );
917    }
918
919  /**
920   * this tests a merge on two pipes with the same source and name.
921   *
922   * @throws Exception
923   */
924  @Test
925  public void testSplitSameSourceMergedSameName() throws Exception
926    {
927    getPlatform().copyFromLocal( inputFileApache );
928
929    // 46 192
930
931    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
932    Tap sink = getPlatform().getTextFile( getOutputPath( "splitsourcemergedsamename" ), SinkMode.REPLACE );
933
934    Pipe pipe = new Pipe( "split" );
935
936    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );
937
938    Pipe left = new Each( pipe, new Fields( "line" ), new RegexFilter( ".*46.*" ) );
939    Pipe right = new Each( pipe, new Fields( "line" ), new RegexFilter( ".*102.*" ) );
940
941    Pipe merged = new GroupBy( "merged", Pipe.pipes( left, right ), new Fields( "line" ) );
942
943    Flow flow = getPlatform().getFlowConnector().connect( source, sink, merged );
944
945    flow.complete();
946
947    validateLength( flow, 3 );
948    }
949
950  /**
951   * Catches failure to properly resolve the grouping fields as incoming to the second group-by
952   *
953   * @throws Exception
954   */
955  @Test
956  public void testGroupGroup() throws Exception
957    {
958    getPlatform().copyFromLocal( inputFileApache );
959
960    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
961
962    Pipe pipe = new Pipe( "test" );
963
964    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip", String.class ), "^[^ ]*" ), new Fields( "ip" ) );
965
966    pipe = new GroupBy( pipe, new Fields( "ip" ) );
967
968    pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) );
969
970    pipe = new GroupBy( pipe, new Fields( "ip" ), new Fields( "count" ) );
971
972    Tap sink = getPlatform().getTextFile( getOutputPath( "groupgroup" ), SinkMode.REPLACE );
973
974    Map<Object, Object> properties = getProperties();
975
976    properties.put( "cascading.serialization.types.required", "true" );
977
978    Flow flow = getPlatform().getFlowConnector( properties ).connect( source, sink, pipe );
979
980    flow.complete();
981
982    validateLength( flow, 8, null );
983    }
984
985  public static class LowerComparator implements Comparator<Comparable>, Hasher<Comparable>, Serializable
986    {
987    @Override
988    public int compare( Comparable lhs, Comparable rhs )
989      {
990      return lhs.toString().toLowerCase().compareTo( rhs.toString().toLowerCase() );
991      }
992
993    @Override
994    public int hashCode( Comparable value )
995      {
996      if( value == null )
997        return 0;
998
999      return value.toString().toLowerCase().hashCode();
1000      }
1001    }
1002
1003  @Test
1004  public void testGroupByInsensitive() throws Exception
1005    {
1006    getPlatform().copyFromLocal( inputFileLower );
1007    getPlatform().copyFromLocal( inputFileUpper );
1008
1009    Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );
1010    Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper );
1011
1012    Map sources = new HashMap();
1013
1014    sources.put( "lower", sourceLower );
1015    sources.put( "upper", sourceUpper );
1016
1017    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "insensitivegrouping" + NONDETERMINISTIC ), SinkMode.REPLACE );
1018
1019    Pipe pipeLower = new Pipe( "lower" );
1020    Pipe pipeUpper = new Pipe( "upper" );
1021
1022    Pipe merge = new Merge( pipeLower, pipeUpper );
1023
1024    Fields charFields = new Fields( "char" );
1025    charFields.setComparator( "char", new LowerComparator() );
1026
1027    Pipe splice = new GroupBy( "groupby", merge, charFields );
1028
1029    splice = new Every( splice, new Fields( "char" ), new Count() );
1030
1031    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
1032
1033    flow.complete();
1034
1035    // we can't guarantee if the grouping key will be upper or lower
1036    validateLength( flow, 5, 1, Pattern.compile( "^\\w+\\s2$" ) );
1037    }
1038  }