Extractive Text Summarization (NLTK) — frequency-based sentence scoring example.

"""Extractive text summarization using word-frequency sentence scoring.

Splits the input into sentences, scores each sentence by the summed
frequency of its non-stopword words, and prints the three highest-scoring
sentences as the summary.

Requires (one-time): nltk.download('punkt'); nltk.download('stopwords')
"""
import re
from heapq import nlargest

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample text paragraph -- you can substitute any text.
text = (
    "Natural language processing (NLP) is a subfield of linguistics, computer "
    "science, information engineering, and artificial intelligence concerned "
    "with the interactions between computers and human languages, in "
    "particular how to program computers to process and analyze large amounts "
    "of natural language data. Challenges in natural language processing "
    "frequently involve speech recognition, natural language understanding, "
    "and natural language generation. The history of natural language "
    "processing generally started in the 1950s, although work can be found "
    "from earlier periods."
)

# Tokenize the ORIGINAL text into sentences first: stripping punctuation
# before sent_tokenize would delete the periods it relies on, collapsing
# everything into a single "sentence".
sentences = sent_tokenize(text)

# Remove special characters and digits for the word-frequency pass.
# Replace with a space (not ''), otherwise adjacent words would be glued
# together and word_tokenize could not separate them.
cleaned = re.sub(r"[^a-zA-Z]", " ", text)

# Tokenize into words and drop English stopwords (case-insensitive).
stop_words = set(stopwords.words("english"))
words = [
    word.lower()
    for word in word_tokenize(cleaned)
    if word.lower() not in stop_words
]

# Word frequency distribution over the cleaned, stopword-free words.
word_freq = nltk.FreqDist(words)

# Score each sentence as the sum of the frequencies of the scored words it
# contains. Sentences of 30+ words are skipped so the summary stays concise;
# the check is hoisted out of the word loop since it is sentence-invariant.
sentence_scores = {}
for sentence in sentences:
    if len(sentence.split(" ")) >= 30:
        continue
    for word in word_tokenize(sentence.lower()):
        if word in word_freq:
            sentence_scores[sentence] = (
                sentence_scores.get(sentence, 0) + word_freq[word]
            )

# Summary = the 3 highest-scoring sentences joined with spaces.
summary_sentences = nlargest(3, sentence_scores, key=sentence_scores.get)
summary = " ".join(summary_sentences)
print(summary)