001/*
002# Licensed Materials - Property of IBM
003# Copyright IBM Corp. 2015  
004 */
005package parallel;
006
007import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher;
008import static com.ibm.streamsx.topology.file.FileStreams.textFileReader;
009import static com.ibm.streamsx.topology.logic.Value.of;
010
011import java.io.ObjectStreamException;
012import java.util.logging.Logger;
013import java.util.regex.Matcher;
014import java.util.regex.Pattern;
015
016import com.ibm.streamsx.topology.TStream;
017import com.ibm.streamsx.topology.Topology;
018import com.ibm.streamsx.topology.context.StreamsContextFactory;
019import com.ibm.streamsx.topology.function.Predicate;
020
021/**
022 * PartitionedParallelRegexGrep is like ParallelRegexGrep, except that the Java
023 * object in the tuple being passed into the parallel region implements the
024 * Keyable interface, and provides a getKey() function which is used to map
025 * tuples to their corresponding channel in the parallel region.
026 * 
027 * Each channel of the parallel region only receives tuples that have the same
028 * hashCode() value of the Key returned by the tuple value's getKey() method. In
029 * other words, for each tuple, the value returned by
030 * 
031 * tupleValue.getKey().hashCode()
032 * 
033 * will go to the same channel, for each tuple which returns that result. To
034 * show this, instead of passing a java.lang.String into the parallel region, a
035 * stringWrapper class is created that implements the Keyable interface.
036 * 
037 * For this sample, if you read from a file that contains the following:
038 * 
039 * Apple Orange Banana Banana Apple Apple
040 * 
041 * you notice that the lines containing Apple will always be sent to the same
042 * channel of the parallel region; the same for the lines containing Orange and
043 * Banana.
044 * 
045 * 
046 */
047public class PartitionedParallelRegexGrep {
048    static final Logger trace = Logger
049            .getLogger("samples.partitionedparallelregexgrep");
050
051    @SuppressWarnings("serial")
052    public static void main(String[] args) throws Exception {
053        String contextType = args[0];
054        String directory = args[1];
055        final Pattern pattern = Pattern.compile(args[2]);
056
057        // Define the topology
058        Topology topology = new Topology("PartitionedParallelRegexGrep");
059
060        // All streams with tuples that are Java String objects
061        TStream<String> files = directoryWatcher(topology, directory);
062        TStream<String> lines = textFileReader(files);
063
064        // Begin parallel region
065        TStream<String> parallelLines = lines
066                .parallel(of(5), TStream.Routing.HASH_PARTITIONED);
067        TStream<String> ParallelFiltered = parallelLines
068                .filter(new Predicate<String>() {
069
070                    @Override
071                    public boolean test(String v1) {
072                        // If you inspect the output of the streams in this
073                        // parallel
074                        // region, you will see that any string that is sent to
075                        // one
076                        // channel will not be sent to another. In other words,
077                        // if you
078                        // see "apple" being sent to this channel, you will
079                        // never see
080                        // "apple" being sent to any other channel.
081                        trace.info("Testing  string \"" + v1
082                                + "\" for the pattern.");
083                        // Pass the line through if it matches the
084                        // regular expression pattern
085                        return matcher.reset(v1).matches();
086                    }
087
088                    transient Matcher matcher;
089
090                    private Object readResolve() throws ObjectStreamException {
091                        matcher = pattern.matcher("");
092                        return this;
093                    }
094                });
095
096        // Combine the results of each parallel filter into one stream, ending
097        // the parallel region.
098        TStream<String> filtered_condensed = ParallelFiltered
099                .endParallel();
100
101        // Print the combined results
102        filtered_condensed.print();
103
104        // Execute the topology
105        StreamsContextFactory.getStreamsContext(contextType).submit(topology);
106    }
107}