/*
 * Decompiled with CFR 0.152.
 */
package com.hankcs.hanlp.mining.cluster;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.classification.utilities.io.ConsoleLogger;
import com.hankcs.hanlp.collection.trie.datrie.MutableDoubleArrayTrieInteger;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.mining.cluster.Cluster;
import com.hankcs.hanlp.mining.cluster.Document;
import com.hankcs.hanlp.mining.cluster.SparseVector;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.MathUtility;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;

/**
 * Text clustering front end: turns documents into bag-of-words vectors and
 * groups them with either k-means or repeated bisection.
 *
 * @param <K> type of the identifier attached to each document
 */
public class ClusterAnalyzer<K> {
    /** Every document added so far, keyed by its identifier. */
    protected HashMap<K, Document<K>> documents_ = new HashMap<K, Document<K>>();
    /** Segmenter used to split raw text into terms. */
    protected Segment segment = HanLP.newSegment();
    /** Maps each distinct word to the integer id used as its vector dimension. */
    protected MutableDoubleArrayTrieInteger vocabulary = new MutableDoubleArrayTrieInteger();
    /** Upper bound on the number of passes {@link #refine_clusters(List)} performs. */
    static final int NUM_REFINE_LOOP = 30;

    /**
     * Looks up the id of a word, registering the word when it is not yet known.
     *
     * @param word the word to look up
     * @return the id already stored for the word, or a new id equal to the
     *         vocabulary size at the time of registration
     */
    protected int id(String word) {
        int id = this.vocabulary.get(word);
        // -1 is treated as "word not present in the vocabulary".
        if (id == -1) {
            id = this.vocabulary.size();
            this.vocabulary.put(word, id);
        }
        return id;
    }

    /**
     * Segments a raw document and keeps only the words worth clustering on.
     * Terms found in the stop word dictionary and terms whose nature starts
     * with {@code "w"} are dropped.
     *
     * @param document raw document text
     * @return the remaining words, in their original order
     */
    protected List<String> preprocess(String document) {
        List<Term> termList = this.segment.seg(document);
        // Filter into a fresh list rather than removing from the segmenter's
        // own result, so the returned term list is never mutated.
        List<String> wordList = new ArrayList<String>(termList.size());
        for (Term term : termList) {
            if (CoreStopWordDictionary.contains(term.word) || term.nature.startsWith("w")) {
                continue;
            }
            wordList.add(term.word);
        }
        return wordList;
    }

    /**
     * Builds a term-frequency vector from a list of words.
     *
     * @param wordList words of one document
     * @return vector mapping each word id to the number of times it occurs
     */
    protected SparseVector toVector(List<String> wordList) {
        SparseVector vector = new SparseVector();
        for (String word : wordList) {
            int id = this.id(word);
            Double count = vector.get(id);
            // First occurrence starts at 1, later occurrences increment.
            vector.put(id, count == null ? 1.0 : count + 1.0);
        }
        return vector;
    }

    /**
     * Adds a raw text document; the text is passed through
     * {@link #preprocess(String)} first.
     *
     * @param id       identifier of the document
     * @param document raw document text
     * @return the value returned by {@link #addDocument(Object, List)}
     */
    public Document<K> addDocument(K id, String document) {
        return this.addDocument(id, this.preprocess(document));
    }

    /**
     * Adds an already tokenized document.
     *
     * @param id       identifier of the document
     * @param document words of the document
     * @return whatever {@code documents_.put} returns, i.e. the document
     *         previously stored under {@code id}, or {@code null} if there was
     *         none (NOT the newly created document)
     */
    public Document<K> addDocument(K id, List<String> document) {
        SparseVector vector = this.toVector(document);
        Document<K> d = new Document<K>(id, vector);
        return this.documents_.put(id, d);
    }

    /**
     * Clusters all added documents with k-means.
     *
     * @param nclusters requested number of clusters
     * @return one set of document identifiers per resulting cluster
     */
    public List<Set<K>> kmeans(int nclusters) {
        Cluster<K> cluster = new Cluster<K>();
        for (Document<K> document : this.documents_.values()) {
            cluster.add_document(document);
        }
        cluster.section(nclusters);
        this.refine_clusters(cluster.sectioned_clusters());
        List<Cluster<K>> clusters_ = new ArrayList<Cluster<K>>(nclusters);
        for (Cluster<K> s : cluster.sectioned_clusters()) {
            s.refresh();
            clusters_.add(s);
        }
        return this.toResult(clusters_);
    }

    /**
     * Converts clusters into sets of document identifiers.
     *
     * @param clusters_ clusters to convert
     * @return one identifier set per cluster, in the same order
     */
    private List<Set<K>> toResult(List<Cluster<K>> clusters_) {
        List<Set<K>> result = new ArrayList<Set<K>>(clusters_.size());
        for (Cluster<K> c : clusters_) {
            Set<K> s = new HashSet<K>();
            for (Document<K> d : c.documents_) {
                s.add(d.id_);
            }
            result.add(s);
        }
        return result;
    }

    /**
     * Repeated bisection clustering bounded only by the number of clusters.
     *
     * @param nclusters requested number of clusters
     * @return one set of document identifiers per resulting cluster
     */
    public List<Set<K>> repeatedBisection(int nclusters) {
        return this.repeatedBisection(nclusters, 0.0);
    }

    /**
     * Repeated bisection clustering bounded only by the sectioning gain.
     *
     * @param limit_eval splitting stops once the best candidate's sectioned
     *                   gain falls below this value
     * @return one set of document identifiers per resulting cluster
     */
    public List<Set<K>> repeatedBisection(double limit_eval) {
        return this.repeatedBisection(0, limit_eval);
    }

    /**
     * Repeated bisection clustering: starting from one cluster holding every
     * document, repeatedly takes the head of a priority queue and replaces it
     * with its two halves until a stop condition is met.
     *
     * @param nclusters  stop once the queue holds this many clusters; ignored
     *                   when not positive
     * @param limit_eval stop once the head cluster's sectioned gain is below
     *                   this value; ignored when not positive
     * @return one set of document identifiers per resulting cluster
     */
    public List<Set<K>> repeatedBisection(int nclusters, double limit_eval) {
        Cluster<K> cluster = new Cluster<K>();
        List<Cluster<K>> clusters_ = new ArrayList<Cluster<K>>(nclusters > 0 ? nclusters : 16);
        for (Document<K> document : this.documents_.values()) {
            cluster.add_document(document);
        }
        PriorityQueue<Cluster<K>> que = new PriorityQueue<Cluster<K>>();
        cluster.section(2);
        this.refine_clusters(cluster.sectioned_clusters());
        cluster.set_sectioned_gain();
        cluster.composite_vector().clear();
        que.add(cluster);
        while (!que.isEmpty()) {
            if (nclusters > 0 && que.size() >= nclusters) {
                break;
            }
            cluster = que.peek();
            if (cluster.sectioned_clusters().size() < 1) {
                break;
            }
            if (limit_eval > 0.0 && cluster.sectioned_gain() < limit_eval) {
                break;
            }
            que.poll();
            List<Cluster<K>> sectioned = cluster.sectioned_clusters();
            for (Cluster<K> c : sectioned) {
                c.section(2);
                this.refine_clusters(c.sectioned_clusters());
                c.set_sectioned_gain();
                // A split that gains too little is discarded by emptying its halves.
                if (c.sectioned_gain() < limit_eval) {
                    for (Cluster<K> sub : c.sectioned_clusters()) {
                        sub.clear();
                    }
                }
                c.composite_vector().clear();
                que.add(c);
            }
        }
        // Each polled cluster is inserted at the front, so the result is in
        // reverse polling order.
        while (!que.isEmpty()) {
            clusters_.add(0, que.poll());
        }
        return this.toResult(clusters_);
    }

    /**
     * Refines a set of clusters by moving single documents between them.
     * In each pass the documents are visited in random order; a document is
     * moved to the other cluster that yields the largest increase of the sum
     * of the two affected composite-vector norms, provided that increase is
     * positive. At most {@link #NUM_REFINE_LOOP} passes are made, and the
     * loop ends early after a pass without any move.
     *
     * @param clusters clusters to refine in place
     * @return accumulated norm improvement over all performed moves
     */
    double refine_clusters(List<Cluster<K>> clusters) {
        double[] norms = new double[clusters.size()];
        int offset = 0;
        for (Cluster<K> cluster : clusters) {
            norms[offset++] = cluster.composite_vector().norm();
        }
        double eval_cluster = 0.0;
        int loop_count = 0;
        while (loop_count++ < NUM_REFINE_LOOP) {
            // Each item is a {cluster index, document index} pair.
            List<int[]> items = new ArrayList<int[]>(this.documents_.size());
            for (int i = 0; i < clusters.size(); ++i) {
                for (int j = 0; j < clusters.get(i).documents().size(); ++j) {
                    items.add(new int[]{i, j});
                }
            }
            Collections.shuffle(items);
            boolean changed = false;
            for (int[] item : items) {
                int cluster_id = item[0];
                int item_id = item[1];
                Cluster<K> cluster = clusters.get(cluster_id);
                Document<K> doc = cluster.documents().get(item_id);
                // Norm of the source cluster if the document were taken out.
                double value_base = this.refined_vector_value(cluster.composite_vector(), doc.feature(), -1);
                double norm_base_moved = Math.pow(norms[cluster_id], 2.0) + value_base;
                norm_base_moved = norm_base_moved > 0.0 ? Math.sqrt(norm_base_moved) : 0.0;
                double eval_max = -1.0;
                double norm_max = 0.0;
                int max_index = 0;
                for (int j = 0; j < clusters.size(); ++j) {
                    if (cluster_id == j) {
                        continue;
                    }
                    Cluster<K> other = clusters.get(j);
                    // Norm of the candidate cluster if the document were put in.
                    double value_target = this.refined_vector_value(other.composite_vector(), doc.feature(), 1);
                    double norm_target_moved = Math.pow(norms[j], 2.0) + value_target;
                    norm_target_moved = norm_target_moved > 0.0 ? Math.sqrt(norm_target_moved) : 0.0;
                    double eval_moved = norm_base_moved + norm_target_moved - norms[cluster_id] - norms[j];
                    if (eval_max < eval_moved) {
                        eval_max = eval_moved;
                        norm_max = norm_target_moved;
                        max_index = j;
                    }
                }
                if (eval_max > 0.0) {
                    eval_cluster += eval_max;
                    clusters.get(max_index).add_document(doc);
                    clusters.get(cluster_id).remove_document(item_id);
                    norms[cluster_id] = norm_base_moved;
                    norms[max_index] = norm_max;
                    changed = true;
                }
            }
            if (!changed) {
                break;
            }
            for (Cluster<K> cluster : clusters) {
                cluster.refresh();
            }
        }
        return eval_cluster;
    }

    /**
     * Computes the sum over the entries of {@code vec} of
     * {@code v^2 + sign * 2 * c * v}, where {@code v} is the entry value and
     * {@code c} is the value {@code composite} holds for the same key. With
     * {@code sign = 1} this is the change of the squared norm of
     * {@code composite} when {@code vec} is added to it, with
     * {@code sign = -1} the change when {@code vec} is subtracted.
     *
     * @param composite composite vector of a cluster
     * @param vec       feature vector of a document
     * @param sign      {@code 1} for adding, {@code -1} for removing
     * @return the computed sum
     */
    double refined_vector_value(SparseVector composite, SparseVector vec, int sign) {
        double sum = 0.0;
        for (Map.Entry<Integer, Double> entry : vec.entrySet()) {
            sum += Math.pow(entry.getValue(), 2.0) + (double)(sign * 2) * composite.get(entry.getKey()) * entry.getValue();
        }
        return sum;
    }

    /**
     * Evaluates a clustering algorithm on a labelled corpus laid out as
     * {@code root/category/file}. Every sub-directory of the root is one
     * category; its files are clustered and the result is scored with the
     * F-measure, weighted by category size.
     *
     * @param folderPath root directory of the corpus
     * @param algorithm  algorithm name; anything that equals {@code "kmeans"}
     *                   after removing hyphens and whitespace and lower-casing
     *                   selects k-means, everything else repeated bisection
     * @return the weighted F-measure; {@code 1.0} when the root cannot be
     *         listed, {@code 0.0} when no document was loaded
     * @throws IllegalArgumentException if an argument is {@code null} or
     *                                  {@code folderPath} is not an existing
     *                                  directory
     */
    public static double evaluate(String folderPath, String algorithm) {
        if (folderPath == null) {
            throw new IllegalArgumentException("\u53c2\u6570 folderPath == null");
        }
        // Fail before loading the corpus rather than with a late NullPointerException.
        if (algorithm == null) {
            throw new IllegalArgumentException("\u53c2\u6570 algorithm == null");
        }
        File root = new File(folderPath);
        if (!root.exists()) {
            throw new IllegalArgumentException(String.format("\u76ee\u5f55 %s \u4e0d\u5b58\u5728", root.getAbsolutePath()));
        }
        if (!root.isDirectory()) {
            throw new IllegalArgumentException(String.format("\u76ee\u5f55 %s \u4e0d\u662f\u4e00\u4e2a\u76ee\u5f55", root.getAbsolutePath()));
        }
        ClusterAnalyzer<String> analyzer = new ClusterAnalyzer<String>();
        File[] folders = root.listFiles();
        if (folders == null) {
            return 1.0;
        }
        ConsoleLogger.logger.start("\u6839\u76ee\u5f55:%s\n\u52a0\u8f7d\u4e2d...\n", folderPath);
        int docSize = 0;
        // Sized for the worst case; only the first categoryCount slots are used.
        int[] ni = new int[folders.length];
        String[] cat = new String[folders.length];
        int categoryCount = 0;
        for (File folder : folders) {
            if (folder.isFile()) {
                continue;
            }
            File[] files = folder.listFiles();
            if (files == null) {
                continue;
            }
            String category = folder.getName();
            cat[categoryCount] = category;
            ConsoleLogger.logger.out("[%s]...", category);
            int total = files.length;
            int logEvery = (int)Math.ceil((float)total / 10000.0f);
            for (int i = 0; i < total; ++i) {
                // Document id is "<category> <file name>"; the scoring below relies on it.
                analyzer.addDocument(category + " " + files[i].getName(), IOUtil.readTxt(files[i].getAbsolutePath()));
                if (i % logEvery == 0) {
                    ConsoleLogger.logger.out("%c[%s]...%.2f%%", 13, category, MathUtility.percentage(i + 1, total));
                }
                ++docSize;
                ++ni[categoryCount];
            }
            ConsoleLogger.logger.out(" %d \u7bc7\u6587\u6863\n", total);
            ++categoryCount;
        }
        ConsoleLogger.logger.finish(" \u52a0\u8f7d\u4e86 %d \u4e2a\u7c7b\u76ee,\u5171 %d \u7bc7\u6587\u6863\n", categoryCount, docSize);
        if (docSize == 0) {
            // Nothing to cluster; also avoids the 0/0 = NaN in the weighting below.
            return 0.0;
        }
        ConsoleLogger.logger.start(algorithm + "\u805a\u7c7b\u4e2d...", new Object[0]);
        // Ask for one cluster per real category, not per directory entry.
        List<Set<String>> clusterList = algorithm.replaceAll("[-\\s]", "").toLowerCase().equals("kmeans") ? analyzer.kmeans(categoryCount) : analyzer.repeatedBisection(categoryCount);
        ConsoleLogger.logger.finish(" \u5b8c\u6bd5\u3002\n", new Object[0]);
        double[] fi = new double[categoryCount];
        for (int i = 0; i < categoryCount; ++i) {
            // Include the separator so "sport" does not also match "sports ...".
            String prefix = cat[i] + " ";
            for (Set<String> j : clusterList) {
                int nij = 0;
                for (String d : j) {
                    if (d.startsWith(prefix)) {
                        ++nij;
                    }
                }
                if (nij == 0) {
                    continue;
                }
                double p = (double)nij / (double)j.size();
                double r = (double)nij / (double)ni[i];
                double f = 2.0 * p * r / (p + r);
                fi[i] = Math.max(fi[i], f);
            }
        }
        double f = 0.0;
        for (int i = 0; i < fi.length; ++i) {
            f += fi[i] * (double)ni[i] / (double)docSize;
        }
        return f;
    }
}

