import { ClusterEntity } from "../api/getUrlBatch";
import { isQuestion } from "./text";

/**
 * Flatten the entities of every cluster into one topic list for the SERP.
 * Each entity is handed to `processEntityValue` together with the `count`
 * of the cluster it came from, and the combined list is then ordered with
 * `sortByItemCount`.
 * @param clusters - Clusters whose `cluster_entities` are collected
 * @return The processed entities, ordered by `sortByItemCount`
 */
export const getTopics = (clusters: ClusterEntity[]): ClusterEntity[] => {
  const topics: ClusterEntity[] = [];

  for (const cluster of clusters) {
    for (const entity of cluster.cluster_entities) {
      topics.push(processEntityValue(entity, cluster.count));
    }
  }

  // Array.prototype.sort orders in place and hands back the same array.
  return topics.sort(sortByItemCount);
};

/**
 * Normalise a single cluster entity and attach its derived fields.
 *
 * The entity is updated IN PLACE and the same object is returned, so the
 * caller's cluster data reflects the changes after this call:
 * - `frequency` and `header_frequency` are re-parsed as base-10 integers;
 * - `header_frequency` is raised to 1 when it is below 1;
 * - `title_frequency` is set to 1 when `title_count` is positive;
 * - `long_tail` is true when `entity` contains more than one
 *   space-separated word;
 * - `net_score` is computed by `getTopicScore`.
 * @param value - Entity to normalise (mutated)
 * @param clusterCount - `count` of the cluster the entity belongs to
 * @return value - The same entity object, after normalisation
 */
const processEntityValue = (
  value: ClusterEntity,
  clusterCount: number
): ClusterEntity => {
  // Explicit radix so the parse is always decimal.
  value.frequency = parseInt(value.frequency.toString(), 10);
  value.header_frequency = parseInt(value.header_frequency.toString(), 10);

  if (value.header_frequency < 1) {
    value.header_frequency = 1;
  }

  if (value.title_count > 0) {
    value.title_frequency = 1;
  }

  value.long_tail = value.entity.split(" ").length > 1;
  // getTopicScore declares the cluster count as its second parameter; the
  // call previously omitted it, leaving `clusterCount` unused.
  value.net_score = getTopicScore(value, clusterCount);

  return value;
};

/**
 * One source page handled by `processBrief`.
 */
export type ProcessedItem = {
  /** Address of the page; `processBrief` derives the page's domain from it. */
  url: string;
  /** Word count of the page; `processBrief` sets a missing or zero value to 0. */
  word_count?: number;
  /** Content blocks; `processBrief` reads `header`, `header_tag` and `html` from each. */
  assets?: Array<any>;
  /** Images of the page; `processBrief` only uses the array length. */
  images?: Array<any>;
  /** Links found in `assets`; overwritten by `processBrief`. */
  links?: Array<any>;
};

/**
 * Cluster shape consumed by `processBrief` and `getTopicsForSERP`.
 */
type Cluster = {
  /** Entities belonging to the cluster. */
  cluster_entities: Array<any>;
  /** Passed to `getTopicScore` as its `count` argument for every entity. */
  count: number;
};

/** Link record that `processBrief` stores on `item.links` and in `domain_map`. */
type BriefLink = {
  anchor: string;
  url: string;
  classed_text: string;
  raw_text: string;
};

/** Per-domain entry of the `domain_map` returned by `processBrief`. */
type BriefDomainEntry = {
  items: Record<string, boolean>;
  internal_links: BriefLink[];
  external_links: BriefLink[];
};

/**
 * Format `total / count` as a whole-number string, returning "0" when there
 * is nothing to average (instead of "NaN" / "Infinity").
 */
const formatBriefAverage = (total: number, count: number): string =>
  count > 0 ? (total / count).toFixed(0) : "0";

/**
 * Collect the usable `<a>` links contained in one HTML fragment of a page.
 * @param htmlString - HTML fragment taken from an asset's `html` array
 * @param itemUrl - URL of the page the fragment belongs to
 * @returns One record per anchor that passes `validateUrl`, has non-blank
 *   text and whose parent element holds more than 20 characters of text
 */
const extractBriefLinks = (htmlString: string, itemUrl: string): BriefLink[] => {
  const links: BriefLink[] = [];
  // NOTE(review): `htmlString` presumably comes from crawled third-party
  // pages. Assigning untrusted markup to `innerHTML` can start resource
  // loads and fire inline handlers (e.g. `<img onerror>`) even on a detached
  // element - consider parsing with DOMParser instead; confirm the origin of
  // `assets[].html` before changing.
  const tempElement = document.createElement("div");
  tempElement.innerHTML = htmlString;

  if (typeof tempElement.querySelectorAll !== "function") {
    return links;
  }

  tempElement.querySelectorAll("a").forEach((anchor) => {
    if (!anchor.href) {
      return;
    }
    // A relative href resolves against the current document, so an anchor
    // whose host equals ours is rebuilt on the source page's domain.
    const resolved =
      anchor.host === document.location.host
        ? "https://" + breakDownURL(itemUrl) + anchor.pathname
        : anchor.href;
    // Drop query string and fragment.
    const url = resolved.split("?")[0].split("#")[0];
    const parentText = anchor.parentElement?.textContent ?? "";
    const anchorText = anchor.textContent;
    if (
      validateUrl(url) === true &&
      parentText.length > 20 &&
      anchorText &&
      anchorText.trim().length > 0
    ) {
      links.push({
        anchor: anchorText,
        url: url,
        classed_text: classParagraph(parentText, anchorText, "span"),
        raw_text: parentText,
      });
    }
  });
  return links;
};

/**
 * Process the brief and return the aggregated data.
 *
 * For every item this:
 * - adds its `word_count` to the totals (items with a truthy `word_count`
 *   are the "valid" items; otherwise `word_count` is set to 0);
 * - collects each distinct header (case-insensitive, 1-119 characters) of
 *   its assets into `question_list` or `section_list` depending on
 *   `isQuestion`, and counts non-`h1` header tags (capped at 10 per item);
 * - extracts the links of its assets' HTML into `item.links` and groups
 *   them by domain into `domain_map`, counting each URL once overall;
 * - counts its images.
 *
 * The items in `processed_items` are mutated (`word_count`, `links`).
 * @param processed_items - The items to be processed.
 * @param clusters - The clusters data.
 * @param query - The query string (returned unchanged).
 * @param lang - The language (returned unchanged).
 * @returns The processed data. The `avg_*` fields are whole-number strings;
 *   `avg_img_count` is averaged over the items that have images, the other
 *   averages over the valid items, and each is "0" when its divisor is 0.
 */
export const processBrief = (
  processed_items: ProcessedItem[],
  clusters: Cluster[],
  query: string,
  lang: string
) => {
  let word_count = 0;
  let valid_items_count = 0;
  let section_count = 0;
  let img_count = 0;
  let img_count_items = 0;
  let link_count = 0;
  const question_list: any[] = [];
  const section_list: any[] = [];
  const seen_links = new Set<string>();
  const domain_map: Record<string, BriefDomainEntry> = {};

  processed_items?.forEach((item) => {
    if (item.word_count) {
      word_count += item.word_count;
      valid_items_count += 1;
    } else {
      item.word_count = 0;
    }

    let h_section_count = 0;
    const processed_links: BriefLink[] = [];
    // A Set rather than a plain object: with `{}` a header such as
    // "constructor" or "toString" looked already-seen because the key
    // resolves to a member of Object.prototype.
    const seen_headers = new Set<string>();

    item.assets?.forEach((asset) => {
      if (!asset) {
        return;
      }
      const header = asset.header;
      if (
        typeof header === "string" &&
        header.length > 0 &&
        header.length < 120
      ) {
        const header_key = header.toLowerCase();
        if (!seen_headers.has(header_key)) {
          seen_headers.add(header_key);
          const modified_value = { ...asset, url: item.url };
          if (isQuestion(header, true)) {
            question_list.push(modified_value);
          } else {
            section_list.push(modified_value);
          }
          const header_tag = asset.header_tag;
          if (
            typeof header_tag === "string" &&
            header_tag !== "h1" &&
            header_tag.includes("h")
          ) {
            h_section_count += 1;
          }
        }
      }
      // Links are extracted from every asset, with or without a header.
      if (Array.isArray(asset.html)) {
        for (const htmlString of asset.html) {
          processed_links.push(...extractBriefLinks(htmlString, item.url));
        }
      }
    });

    section_count += Math.min(h_section_count, 10);
    item.links = processed_links;

    if (processed_links.length > 0) {
      const item_domain = breakDownURL(item.url);
      processed_links.forEach((link) => {
        const domain = breakDownURL(link.url) ?? "";
        if (!domain_map[domain]) {
          domain_map[domain] = {
            items: {},
            internal_links: [],
            external_links: [],
          };
        }
        // Each URL is filed once across all items; only links pointing to a
        // different domain than their page count towards `link_count`.
        if (!seen_links.has(link.url)) {
          if (item_domain === domain) {
            domain_map[domain].internal_links.push(link);
          } else {
            domain_map[domain].external_links.push(link);
            link_count += 1;
          }
          seen_links.add(link.url);
        }
        domain_map[domain].items[item.url] = true;
      });
    }

    if (item.images && item.images.length > 0) {
      img_count += item.images.length;
      img_count_items += 1;
    }
  });

  const topics = getTopicsForSERP(clusters);
  return {
    avg_img_count: formatBriefAverage(img_count, img_count_items),
    avg_link_count: formatBriefAverage(link_count, valid_items_count),
    avg_word_count: formatBriefAverage(word_count, valid_items_count),
    avg_section_count: formatBriefAverage(section_count, valid_items_count),
    domain_map: domain_map,
    question_list: question_list,
    section_list: section_list,
    sources_processed_count: valid_items_count,
    processed_items: processed_items,
    query: query,
    lang: lang,
    clusters: clusters,
    topics: topics,
  };
};

/**
 * Strip diacritics from a string: the text is decomposed to Unicode NFD and
 * every combining mark in the range U+0300-U+036F is removed.
 * @param str - Text to strip
 * @returns The text without those combining marks
 */
export const removeAccents = (str: string) => {
  const combiningMarks = /[\u0300-\u036f]/g;
  const decomposed = str.normalize("NFD");
  return decomposed.replace(combiningMarks, "");
};

/**
 * Reduce a URL to its domain: a leading `http://` or `https://` and a
 * leading `www.` are removed, and everything from the first `/`, `?` or `#`
 * onwards is dropped. A port, if present, is kept.
 * @param url - URL (with or without protocol) to reduce
 * @returns The domain, or `undefined` when `url` is `null` or `undefined`
 */
export const breakDownURL = (
  url: string | null | undefined
): string | undefined => {
  // `== null` covers both null and undefined; an undefined url used to throw.
  if (url == null) {
    return undefined;
  }
  let rest = url;
  if (rest.startsWith("http://")) {
    rest = rest.slice(7);
  } else if (rest.startsWith("https://")) {
    rest = rest.slice(8);
  }
  if (rest.startsWith("www.")) {
    rest = rest.slice(4);
  }
  // Also cut at "?" and "#" so "example.com?x=1" does not leak the query
  // into the domain when the URL has no path.
  return rest.split(/[/?#]/)[0];
};

/**
 * Build an HTML string from `paragraph` in which every case-insensitive
 * occurrence of `sentence` is wrapped in `<tag class='ent'>...</tag>`.
 *
 * Both inputs are treated as plain text: `sentence` is matched literally
 * (regex metacharacters are escaped) and all text placed in the result is
 * HTML-escaped, so markup-like characters from the page cannot become
 * markup in the output.
 * @param paragraph - Plain text surrounding the sentence
 * @param sentence - Plain text to highlight inside the paragraph
 * @param tag - Element name for the wrapper; "span" is used when the value
 *   is not a simple element name
 * @returns The HTML string, or "" when either input is empty
 */
const classParagraph = (paragraph: string, sentence: string, tag: string) => {
  if (!paragraph || !sentence) {
    return "";
  }

  const htmlEntities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  const escapeHtml = (text: string): string =>
    text.replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

  // Only a plain element name is accepted, as it is spliced into markup.
  const wrapperTag =
    typeof tag === "string" && /^[a-z][a-z0-9]*$/i.test(tag) ? tag : "span";
  const wrap = (text: string): string =>
    "<" + wrapperTag + " class='ent'>" + escapeHtml(text) + "</" + wrapperTag + ">";

  const text = paragraph.trim();
  const needle = sentence.trim();
  if (text.length === 0) {
    return "";
  }
  if (needle.length === 0) {
    // Nothing to highlight (a whitespace-only sentence would otherwise
    // match at every word boundary).
    return escapeHtml(text);
  }
  if (text === needle) {
    return wrap(text);
  }

  // Match the sentence literally, not as a regular expression.
  const literal = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // "\b" only works next to a word character; anchoring an edge such as
  // "+" or an accented letter with it would prevent any match.
  const leading = /^\w/.test(needle) ? "\\b" : "";
  const trailing = /\w$/.test(needle) ? "\\b" : "";
  const searchRegExp = new RegExp("(" + leading + literal + trailing + ")", "gi");

  // Splitting on a single capture group yields plain text at even indexes
  // and the matched occurrences at odd indexes.
  return text
    .split(searchRegExp)
    .map((part, index) => (index % 2 === 1 ? wrap(part) : escapeHtml(part)))
    .join("");
};

/**
 * Build the SERP topic list from the clusters without modifying them.
 *
 * Every entity is copied and the copy receives normalised fields:
 * - `frequency` parsed as a base-10 integer;
 * - `header_frequency` set to 1 when it lies strictly between 0 and 1,
 *   otherwise parsed as a base-10 integer;
 * - `title_frequency` set to 1 when `title_count` is positive;
 * - `long_tail`, true when `entity` has more than one space-separated word;
 * - `net_score` from `getTopicScore`.
 * @param clusters - Clusters to read; null/undefined yields an empty list
 * @returns The entity copies, ordered with `sortByItemCount`
 */
const getTopicsForSERP = (clusters: Cluster[]) => {
  const list: any[] = [];
  clusters?.forEach((c) => {
    c.cluster_entities?.forEach((value) => {
      const frequency = parseInt(value.frequency, 10);
      const header_frequency =
        value.header_frequency > 0 && value.header_frequency < 1
          ? 1
          : parseInt(value.header_frequency, 10);
      const title_frequency = value.title_count > 0 ? 1 : value.title_frequency;
      const long_tail = value.entity.split(" ").length > 1;

      const normalized = {
        ...value,
        frequency,
        header_frequency,
        title_frequency,
        long_tail,
      };
      // Score the normalised copy, as processEntityValue does: scoring the
      // raw entity would concatenate instead of add when `frequency` is
      // still a string.
      const net_score = getTopicScore(normalized, c.count);

      list.push({ ...normalized, net_score });
    });
  });
  list.sort(sortByItemCount);
  return list;
};

/**
 * Compute the net score of an entity:
 * `(words + title_count + header_count + frequency) * item_count`, where
 * `words` is the number of space-separated words in `entity`, capped at 3.
 *
 * The numeric fields are coerced to numbers before use, so numeric strings
 * are added rather than concatenated; a missing or non-numeric field
 * counts as 0.
 * @param value - Entity with `entity`, `title_count`, `header_count`,
 *   `frequency` and `item_count`
 * @param count - Cluster count; currently not used in the score, accepted
 *   (and optional) so existing call sites with one or two arguments work
 * @returns The score
 */
const getTopicScore = (value: any, count?: number) => {
  const toNumber = (raw: unknown): number => {
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : 0;
  };

  const word_count = Math.min(value.entity.split(" ").length, 3);
  const score =
    (word_count +
      toNumber(value.title_count) +
      toNumber(value.header_count) +
      toNumber(value.frequency)) *
    toNumber(value.item_count);
  return score;
};

/**
 * Comparator that orders by `item_count`, largest first.
 * @returns 1 when `a` has the smaller count, -1 when it has the larger
 *   count, 0 otherwise
 */
const sortByItemCount = (a: any, b: any) => {
  const left = a.item_count;
  const right = b.item_count;
  return left < right ? 1 : left > right ? -1 : 0;
};

/** Matches a URL that already starts with a protocol the pattern accepts. */
const URL_PROTOCOL_PREFIX = /^(?:https?|ftp):\/\//i;

/** Pattern for a public http(s)/ftp URL; compiled once, not on every call. */
const VALID_URL_PATTERN = new RegExp(
  "^" +
    // protocol identifier
    "(?:(?:https?|ftp)://)" +
    // user:pass authentication
    "(?:\\S+(?::\\S*)?@)?" +
    "(?:" +
    // IP address exclusion
    // private & local networks
    "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
    "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
    "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
    // IP address dotted notation octets
    // excludes loopback network 0.0.0.0
    // excludes reserved space >= 224.0.0.0
    // excludes network & broadcast addresses
    // (first & last IP address of each class)
    "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
    "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
    "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
    "|" +
    // host name
    "(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)" +
    // domain name
    "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*" +
    // TLD identifier
    "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
    // TLD may end with dot
    "\\.?" +
    ")" +
    // port number
    "(?::\\d{2,5})?" +
    // resource path
    "(?:[/?#]\\S*)?" +
    "$",
  "i"
);

/**
 * Check whether a string is a well-formed public URL.
 *
 * A value that does not start with `http://`, `https://` or `ftp://` is
 * tested as if it were prefixed with `https://`. Only the start of the
 * string is inspected for the protocol, so "http" appearing later (for
 * example in a path) no longer prevents the prefix from being added.
 * @param url - URL or bare host/path to check
 * @returns true when the (possibly prefixed) value matches the URL pattern
 */
function validateUrl(url: string): boolean {
  if (typeof url !== "string") {
    return false;
  }
  const candidate = URL_PROTOCOL_PREFIX.test(url) ? url : "https://" + url;
  return VALID_URL_PATTERN.test(candidate);
}
